Flutter Engine
The Flutter Engine
regexp_parser.cc
Go to the documentation of this file.
1// Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
2// for details. All rights reserved. Use of this source code is governed by a
3// BSD-style license that can be found in the LICENSE file.
4
5#include "vm/regexp_parser.h"
6
7#include "unicode/uchar.h"
8#include "unicode/uniset.h"
9
10#include "platform/unicode.h"
11
12#include "vm/longjump.h"
13#include "vm/object_store.h"
14#include "vm/symbols.h"
15
16namespace dart {
17
18#define Z zone()
19
20// Enables possessive quantifier syntax for testing.
21static constexpr bool FLAG_regexp_possessive_quantifier = false;
22
24 : zone_(Thread::Current()->zone()),
25 pending_empty_(false),
26 flags_(flags),
27 characters_(nullptr),
28 pending_surrogate_(kNoPendingSurrogate),
29 terms_(),
30 text_(),
31 alternatives_()
32#ifdef DEBUG
33 ,
34 last_added_(ADD_NONE)
35#endif
36{
37}
38
39void RegExpBuilder::AddLeadSurrogate(uint16_t lead_surrogate) {
40 ASSERT(Utf16::IsLeadSurrogate(lead_surrogate));
41 FlushPendingSurrogate();
42 // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
43 pending_surrogate_ = lead_surrogate;
44}
45
46void RegExpBuilder::AddTrailSurrogate(uint16_t trail_surrogate) {
47 ASSERT(Utf16::IsTrailSurrogate(trail_surrogate));
48 if (pending_surrogate_ != kNoPendingSurrogate) {
49 uint16_t lead_surrogate = pending_surrogate_;
50 pending_surrogate_ = kNoPendingSurrogate;
51 ASSERT(Utf16::IsLeadSurrogate(lead_surrogate));
52 uint32_t combined = Utf16::Decode(lead_surrogate, trail_surrogate);
53 if (NeedsDesugaringForIgnoreCase(combined)) {
55 } else {
56 auto surrogate_pair = new (Z) ZoneGrowableArray<uint16_t>(2);
57 surrogate_pair->Add(lead_surrogate);
58 surrogate_pair->Add(trail_surrogate);
59 RegExpAtom* atom = new (Z) RegExpAtom(surrogate_pair, flags_);
60 AddAtom(atom);
61 }
62 } else {
63 pending_surrogate_ = trail_surrogate;
64 FlushPendingSurrogate();
65 }
66}
67
68void RegExpBuilder::FlushPendingSurrogate() {
69 if (pending_surrogate_ != kNoPendingSurrogate) {
70 ASSERT(is_unicode());
71 uint32_t c = pending_surrogate_;
72 pending_surrogate_ = kNoPendingSurrogate;
74 }
75}
76
77void RegExpBuilder::FlushCharacters() {
78 FlushPendingSurrogate();
79 pending_empty_ = false;
80 if (characters_ != nullptr) {
81 RegExpTree* atom = new (Z) RegExpAtom(characters_, flags_);
82 characters_ = nullptr;
83 text_.Add(atom);
84 LAST(ADD_ATOM);
85 }
86}
87
88void RegExpBuilder::FlushText() {
89 FlushCharacters();
90 intptr_t num_text = text_.length();
91 if (num_text == 0) {
92 return;
93 } else if (num_text == 1) {
94 terms_.Add(text_.Last());
95 } else {
96 RegExpText* text = new (Z) RegExpText();
97 for (intptr_t i = 0; i < num_text; i++)
98 text_[i]->AppendToText(text);
99 terms_.Add(text);
100 }
101 text_.Clear();
102}
103
105 FlushPendingSurrogate();
106 pending_empty_ = false;
107 if (NeedsDesugaringForIgnoreCase(c)) {
109 } else {
110 if (characters_ == nullptr) {
111 characters_ = new (Z) ZoneGrowableArray<uint16_t>(4);
112 }
113 characters_->Add(c);
114 LAST(ADD_CHAR);
115 }
116}
117
119 if (c > static_cast<uint32_t>(Utf16::kMaxCodeUnit)) {
120 ASSERT(is_unicode());
121 uint16_t surrogates[2];
122 Utf16::Encode(c, surrogates);
123 AddLeadSurrogate(surrogates[0]);
124 AddTrailSurrogate(surrogates[1]);
125 } else if (is_unicode() && Utf16::IsLeadSurrogate(c)) {
126 AddLeadSurrogate(c);
127 } else if (is_unicode() && Utf16::IsTrailSurrogate(c)) {
128 AddTrailSurrogate(c);
129 } else {
130 AddCharacter(static_cast<uint16_t>(c));
131 }
132}
133
135 // A lead or trail surrogate parsed via escape sequence will not
136 // pair up with any preceding lead or following trail surrogate.
137 FlushPendingSurrogate();
139 FlushPendingSurrogate();
140}
141
143 pending_empty_ = true;
144}
145
147 if (NeedsDesugaringForUnicode(cc)) {
148 // With /u, character class needs to be desugared, so it
149 // must be a standalone term instead of being part of a RegExpText.
150 AddTerm(cc);
151 } else {
152 AddAtom(cc);
153 }
154}
155
158 AddTerm(new (Z) RegExpCharacterClass(ranges, flags_));
159}
160
162 if (term->IsEmpty()) {
163 AddEmpty();
164 return;
165 }
166 if (term->IsTextElement()) {
167 FlushCharacters();
168 text_.Add(term);
169 } else {
170 FlushText();
171 terms_.Add(term);
172 }
173 LAST(ADD_ATOM);
174}
175
177 FlushText();
178 terms_.Add(term);
179 LAST(ADD_ATOM);
180}
181
183 FlushText();
184 terms_.Add(assert);
185 LAST(ADD_ASSERT);
186}
187
189 FlushTerms();
190}
191
192void RegExpBuilder::FlushTerms() {
193 FlushText();
194 intptr_t num_terms = terms_.length();
195 RegExpTree* alternative;
196 if (num_terms == 0) {
197 alternative = RegExpEmpty::GetInstance();
198 } else if (num_terms == 1) {
199 alternative = terms_.Last();
200 } else {
201 ZoneGrowableArray<RegExpTree*>* terms =
202 new (Z) ZoneGrowableArray<RegExpTree*>();
203 for (intptr_t i = 0; i < terms_.length(); i++) {
204 terms->Add(terms_[i]);
205 }
206 alternative = new (Z) RegExpAlternative(terms);
207 }
208 alternatives_.Add(alternative);
209 terms_.Clear();
210 LAST(ADD_NONE);
211}
212
213bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
214 if (!is_unicode()) return false;
215 // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
216 // necessarily mean that we need to desugar. It's probably nicer to have a
217 // separate pass to figure out unicode desugarings.
218 if (ignore_case()) return true;
219 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();
221
222 if (cc->is_negated()) {
223 auto negated_ranges =
224 new (Z) ZoneGrowableArray<CharacterRange>(ranges->length());
225 CharacterRange::Negate(ranges, negated_ranges);
226 ranges = negated_ranges;
227 }
228
229 for (int i = ranges->length() - 1; i >= 0; i--) {
230 uint32_t from = ranges->At(i).from();
231 uint32_t to = ranges->At(i).to();
232 // Check for non-BMP characters.
233 if (to >= Utf16::kMaxCodeUnit) return true;
234 // Check for lone surrogates.
235 if (from <= Utf16::kTrailSurrogateEnd && to >= Utf16::kLeadSurrogateStart) {
236 return true;
237 }
238 }
239 return false;
240}
241
242bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uint32_t c) {
243 if (is_unicode() && ignore_case()) {
244 icu::UnicodeSet set(c, c);
245 set.closeOver(USET_CASE_INSENSITIVE);
246 set.removeAllStrings();
247 return set.size() > 1;
248 }
249 return false;
250}
251
253 FlushTerms();
254 intptr_t num_alternatives = alternatives_.length();
255 if (num_alternatives == 0) {
257 }
258 if (num_alternatives == 1) {
259 return alternatives_.Last();
260 }
261 ZoneGrowableArray<RegExpTree*>* alternatives =
263 for (intptr_t i = 0; i < alternatives_.length(); i++) {
264 alternatives->Add(alternatives_[i]);
265 }
266 return new (Z) RegExpDisjunction(alternatives);
267}
268
270 intptr_t min,
271 intptr_t max,
272 RegExpQuantifier::QuantifierType quantifier_type) {
273 if (pending_empty_) {
274 pending_empty_ = false;
275 return true;
276 }
277 RegExpTree* atom;
278 if (characters_ != nullptr) {
279 DEBUG_ASSERT(last_added_ == ADD_CHAR);
280 // Last atom was character.
281
282 ZoneGrowableArray<uint16_t>* char_vector =
284 char_vector->AddArray(*characters_);
285 intptr_t num_chars = char_vector->length();
286 if (num_chars > 1) {
289 for (intptr_t i = 0; i < num_chars - 1; i++) {
290 prefix->Add(char_vector->At(i));
291 }
292 text_.Add(new (Z) RegExpAtom(prefix, flags_));
294 tail->Add(char_vector->At(num_chars - 1));
295 char_vector = tail;
296 }
297 characters_ = nullptr;
298 atom = new (Z) RegExpAtom(char_vector, flags_);
299 FlushText();
300 } else if (text_.length() > 0) {
301 DEBUG_ASSERT(last_added_ == ADD_ATOM);
302 atom = text_.RemoveLast();
303 FlushText();
304 } else if (terms_.length() > 0) {
305 DEBUG_ASSERT(last_added_ == ADD_ATOM);
306 atom = terms_.RemoveLast();
307 if (auto lookaround = atom->AsLookaround()) {
308 // With /u, lookarounds are not quantifiable.
309 if (is_unicode()) return false;
310 // Lookbehinds are not quantifiable.
311 if (lookaround->type() == RegExpLookaround::LOOKBEHIND) {
312 return false;
313 }
314 }
315 if (atom->max_match() == 0) {
316 // Guaranteed to only match an empty string.
317 LAST(ADD_TERM);
318 if (min == 0) {
319 return true;
320 }
321 terms_.Add(atom);
322 return true;
323 }
324 } else {
325 // Only call immediately after adding an atom or character!
326 UNREACHABLE();
327 }
328 terms_.Add(new (Z) RegExpQuantifier(min, max, quantifier_type, atom));
329 LAST(ADD_TERM);
330 return true;
331}
332
333// ----------------------------------------------------------------------------
334// Implementation of Parser
335
337 : zone_(Thread::Current()->zone()),
338 captures_(nullptr),
339 named_captures_(nullptr),
340 named_back_references_(nullptr),
341 in_(in),
342 current_(kEndMarker),
343 next_pos_(0),
344 captures_started_(0),
345 capture_count_(0),
346 has_more_(true),
347 top_level_flags_(flags),
348 simple_(false),
349 contains_anchor_(false),
350 is_scanned_for_captures_(false),
351 has_named_captures_(false) {
352 Advance();
353}
354
355inline uint32_t RegExpParser::ReadNext(bool update_position) {
356 intptr_t position = next_pos_;
357 const uint16_t c0 = in().CharAt(position);
358 uint32_t c = c0;
359 position++;
360 if (is_unicode() && position < in().Length() && Utf16::IsLeadSurrogate(c0)) {
361 const uint16_t c1 = in().CharAt(position);
362 if (Utf16::IsTrailSurrogate(c1)) {
363 c = Utf16::Decode(c0, c1);
364 position++;
365 }
366 }
367 if (update_position) next_pos_ = position;
368 return c;
369}
370
371uint32_t RegExpParser::Next() {
372 if (has_next()) {
373 return ReadNext(false);
374 } else {
375 return kEndMarker;
376 }
377}
378
380 if (has_next()) {
381 current_ = ReadNext(true);
382 } else {
383 current_ = kEndMarker;
384 // Advance so that position() points to 1 after the last character. This is
385 // important so that Reset() to this position works correctly.
386 next_pos_ = in().Length() + 1;
387 has_more_ = false;
388 }
389}
390
391void RegExpParser::Reset(intptr_t pos) {
392 next_pos_ = pos;
393 has_more_ = (pos < in().Length());
394 Advance();
395}
396
397void RegExpParser::Advance(intptr_t dist) {
398 next_pos_ += dist - 1;
399 Advance();
400}
401
403 return simple_;
404}
405
407 switch (c) {
408 case '^':
409 case '$':
410 case '\\':
411 case '.':
412 case '*':
413 case '+':
414 case '?':
415 case '(':
416 case ')':
417 case '[':
418 case ']':
419 case '{':
420 case '}':
421 case '|':
422 case '/':
423 return true;
424 default:
425 break;
426 }
427 return false;
428}
429
431 // Zip to the end to make sure the no more input is read.
432 current_ = kEndMarker;
433 next_pos_ = in().Length();
434
435 // Throw a FormatException on parsing failures.
437 String& str = String::Handle();
438 args ^= Array::New(3);
439 str ^= String::New(message);
440 args.SetAt(0, str);
441 args.SetAt(1, Symbols::Blank());
442 args.SetAt(2, in());
443 str ^= String::ConcatAll(args);
444 args ^= Array::New(1);
445 args.SetAt(0, str);
447 UNREACHABLE();
448}
449
450// Pattern ::
451// Disjunction
454 PatchNamedBackReferences();
455 ASSERT(!has_more());
456 // If the result of parsing is a literal string atom, and it has the
457 // same length as the input, then the atom is identical to the input.
458 if (result->IsAtom() && result->AsAtom()->length() == in().Length()) {
459 simple_ = true;
460 }
461 return result;
462}
463
464// Used for error messages where we would have fallen back on treating an
465// escape as the identity escape, but we are in Unicode mode.
466static const char* kUnicodeIdentity =
467 "Invalid identity escape in Unicode pattern";
468
469// Disjunction ::
470// Alternative
471// Alternative | Disjunction
472// Alternative ::
473// [empty]
474// Term Alternative
475// Term ::
476// Assertion
477// Atom
478// Atom Quantifier
480 // Used to store current state while parsing subexpressions.
481 RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
482 0, nullptr, top_level_flags_, Z);
483 RegExpParserState* stored_state = &initial_state;
484 // Cache the builder in a local variable for quick access.
485 RegExpBuilder* builder = initial_state.builder();
486 while (true) {
487 switch (current()) {
488 case kEndMarker:
489 if (stored_state->IsSubexpression()) {
490 // Inside a parenthesized group when hitting end of input.
491 ReportError("Unterminated group");
492 UNREACHABLE();
493 }
494 ASSERT(INITIAL == stored_state->group_type());
495 // Parsing completed successfully.
496 return builder->ToRegExp();
497 case ')': {
498 if (!stored_state->IsSubexpression()) {
499 ReportError("Unmatched ')'");
500 UNREACHABLE();
501 }
502 ASSERT(INITIAL != stored_state->group_type());
503
504 Advance();
505 // End disjunction parsing and convert builder content to new single
506 // regexp atom.
507 RegExpTree* body = builder->ToRegExp();
508
509 intptr_t end_capture_index = captures_started();
510
511 intptr_t capture_index = stored_state->capture_index();
512 SubexpressionType group_type = stored_state->group_type();
513
514 // Build result of subexpression.
515 if (group_type == CAPTURE) {
516 if (stored_state->IsNamedCapture()) {
517 CreateNamedCaptureAtIndex(stored_state->capture_name(),
518 capture_index);
519 }
520 RegExpCapture* capture = GetCapture(capture_index);
521 capture->set_body(body);
522 body = capture;
523 } else if (group_type != GROUPING) {
524 ASSERT(group_type == POSITIVE_LOOKAROUND ||
525 group_type == NEGATIVE_LOOKAROUND);
526 bool is_positive = (group_type == POSITIVE_LOOKAROUND);
527 body = new (Z) RegExpLookaround(
528 body, is_positive, end_capture_index - capture_index,
529 capture_index, stored_state->lookaround_type());
530 }
531
532 // Restore previous state.
533 stored_state = stored_state->previous_state();
534 builder = stored_state->builder();
535
536 builder->AddAtom(body);
537 // For compatibility with JSC and ES3, we allow quantifiers after
538 // lookaheads, and break in all cases.
539 break;
540 }
541 case '|': {
542 Advance();
543 builder->NewAlternative();
544 continue;
545 }
546 case '*':
547 case '+':
548 case '?':
549 ReportError("Nothing to repeat");
550 UNREACHABLE();
551 case '^': {
552 Advance();
553 if (builder->is_multi_line()) {
554 builder->AddAssertion(new (Z) RegExpAssertion(
556 } else {
557 builder->AddAssertion(new (Z) RegExpAssertion(
560 }
561 continue;
562 }
563 case '$': {
564 Advance();
565 RegExpAssertion::AssertionType assertion_type =
566 builder->is_multi_line() ? RegExpAssertion::END_OF_LINE
568 builder->AddAssertion(
569 new (Z) RegExpAssertion(assertion_type, builder->flags()));
570 continue;
571 }
572 case '.': {
573 Advance();
574 auto ranges = new (Z) ZoneGrowableArray<CharacterRange>(2);
575 if (builder->is_dot_all()) {
576 // Everything.
578 '*', ranges,
579 /*add_unicode_case_equivalents=*/false);
580 } else {
581 // everything except \x0a, \x0d, \u2028 and \u2029
583 '.', ranges,
584 /*add_unicode_case_equivalents=*/false);
585 }
587 new (Z) RegExpCharacterClass(ranges, builder->flags());
588 builder->AddCharacterClass(cc);
589 break;
590 }
591 case '(': {
592 stored_state = ParseOpenParenthesis(stored_state);
593 builder = stored_state->builder();
594 continue;
595 }
596 case '[': {
598 builder->AddCharacterClass(atom->AsCharacterClass());
599 break;
600 }
601 // Atom ::
602 // \ AtomEscape
603 case '\\':
604 switch (Next()) {
605 case kEndMarker:
606 ReportError("\\ at end of pattern");
607 UNREACHABLE();
608 case 'b':
609 Advance(2);
610 builder->AddAssertion(new (Z) RegExpAssertion(
612 continue;
613 case 'B':
614 Advance(2);
615 builder->AddAssertion(new (Z) RegExpAssertion(
617 continue;
618 // AtomEscape ::
619 // CharacterClassEscape
620 //
621 // CharacterClassEscape :: one of
622 // d D s S w W
623 case 'd':
624 case 'D':
625 case 's':
626 case 'S':
627 case 'w':
628 case 'W': {
629 uint32_t c = Next();
630 Advance(2);
631 auto ranges = new (Z) ZoneGrowableArray<CharacterRange>(2);
633 c, ranges, is_unicode() && builder->ignore_case());
635 new (Z) RegExpCharacterClass(ranges, builder->flags());
636 builder->AddCharacterClass(cc);
637 break;
638 }
639 case 'p':
640 case 'P': {
641 uint32_t p = Next();
642 Advance(2);
643
644 if (is_unicode()) {
645 auto name_1 = new (Z) ZoneGrowableArray<char>();
646 auto name_2 = new (Z) ZoneGrowableArray<char>();
647 auto ranges = new (Z) ZoneGrowableArray<CharacterRange>(2);
648 if (ParsePropertyClassName(name_1, name_2)) {
649 if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) {
651 new (Z) RegExpCharacterClass(ranges, builder->flags());
652 builder->AddCharacterClass(cc);
653 break;
654 }
655 }
656 ReportError("Invalid property name");
657 UNREACHABLE();
658 } else {
659 builder->AddCharacter(p);
660 }
661 break;
662 }
663 case '1':
664 case '2':
665 case '3':
666 case '4':
667 case '5':
668 case '6':
669 case '7':
670 case '8':
671 case '9': {
672 intptr_t index = 0;
673 if (ParseBackReferenceIndex(&index)) {
674 if (stored_state->IsInsideCaptureGroup(index)) {
675 // The back reference is inside the capture group it refers to.
676 // Nothing can possibly have been captured yet, so we use empty
677 // instead. This ensures that, when checking a back reference,
678 // the capture registers of the referenced capture are either
679 // both set or both cleared.
680 builder->AddEmpty();
681 } else {
682 RegExpCapture* capture = GetCapture(index);
683 RegExpTree* atom =
684 new (Z) RegExpBackReference(capture, builder->flags());
685 builder->AddAtom(atom);
686 }
687 break;
688 }
689 // With /u, no identity escapes except for syntax characters are
690 // allowed. Otherwise, all identity escapes are allowed.
691 if (is_unicode()) {
693 UNREACHABLE();
694 }
695 uint32_t first_digit = Next();
696 if (first_digit == '8' || first_digit == '9') {
697 builder->AddCharacter(first_digit);
698 Advance(2);
699 break;
700 }
701 }
703 case '0': {
704 Advance();
705 if (is_unicode() && Next() >= '0' && Next() <= '9') {
706 // With /u, decimal escape with leading 0 are not parsed as octal.
707 ReportError("Invalid decimal escape");
708 UNREACHABLE();
709 }
710 uint32_t octal = ParseOctalLiteral();
711 builder->AddCharacter(octal);
712 break;
713 }
714 // ControlEscape :: one of
715 // f n r t v
716 case 'f':
717 Advance(2);
718 builder->AddCharacter('\f');
719 break;
720 case 'n':
721 Advance(2);
722 builder->AddCharacter('\n');
723 break;
724 case 'r':
725 Advance(2);
726 builder->AddCharacter('\r');
727 break;
728 case 't':
729 Advance(2);
730 builder->AddCharacter('\t');
731 break;
732 case 'v':
733 Advance(2);
734 builder->AddCharacter('\v');
735 break;
736 case 'c': {
737 Advance();
738 uint32_t controlLetter = Next();
739 // Special case if it is an ASCII letter.
740 // Convert lower case letters to uppercase.
741 uint32_t letter = controlLetter & ~('a' ^ 'A');
742 if (letter < 'A' || 'Z' < letter) {
743 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
744 // This is outside the specification. We match JSC in
745 // reading the backslash as a literal character instead
746 // of as starting an escape.
747 if (is_unicode()) {
748 // With /u, invalid escapes are not treated as identity escapes.
750 UNREACHABLE();
751 }
752 builder->AddCharacter('\\');
753 } else {
754 Advance(2);
755 builder->AddCharacter(controlLetter & 0x1f);
756 }
757 break;
758 }
759 case 'x': {
760 Advance(2);
761 uint32_t value;
762 if (ParseHexEscape(2, &value)) {
763 builder->AddCharacter(value);
764 } else if (!is_unicode()) {
765 builder->AddCharacter('x');
766 } else {
767 // With /u, invalid escapes are not treated as identity escapes.
769 UNREACHABLE();
770 }
771 break;
772 }
773 case 'u': {
774 Advance(2);
775 uint32_t value;
777 builder->AddEscapedUnicodeCharacter(value);
778 } else if (!is_unicode()) {
779 builder->AddCharacter('u');
780 } else {
781 // With /u, invalid escapes are not treated as identity escapes.
783 UNREACHABLE();
784 }
785 break;
786 }
787 case 'k':
788 // Either an identity escape or a named back-reference. The two
789 // interpretations are mutually exclusive: '\k' is interpreted as
790 // an identity escape for non-Unicode patterns without named
791 // capture groups, and as the beginning of a named back-reference
792 // in all other cases.
793 if (is_unicode() || HasNamedCaptures()) {
794 Advance(2);
795 ParseNamedBackReference(builder, stored_state);
796 break;
797 }
799 default:
800 Advance();
801 // With the unicode flag, no identity escapes except for syntax
802 // characters are allowed. Otherwise, all identity escapes are
803 // allowed.
804 if (!is_unicode() || IsSyntaxCharacterOrSlash(current())) {
805 builder->AddCharacter(current());
806 Advance();
807 } else {
809 UNREACHABLE();
810 }
811 break;
812 }
813 break;
814 case '{': {
815 intptr_t dummy;
816 if (ParseIntervalQuantifier(&dummy, &dummy)) {
817 ReportError("Nothing to repeat");
818 UNREACHABLE();
819 }
820 }
822 case '}':
823 case ']':
824 if (is_unicode()) {
825 ReportError("Lone quantifier brackets");
826 UNREACHABLE();
827 }
829 default:
830 builder->AddUnicodeCharacter(current());
831 Advance();
832 break;
833 } // end switch(current())
834
835 intptr_t min;
836 intptr_t max;
837 switch (current()) {
838 // QuantifierPrefix ::
839 // *
840 // +
841 // ?
842 // {
843 case '*':
844 min = 0;
846 Advance();
847 break;
848 case '+':
849 min = 1;
851 Advance();
852 break;
853 case '?':
854 min = 0;
855 max = 1;
856 Advance();
857 break;
858 case '{':
860 if (max < min) {
861 ReportError("numbers out of order in {} quantifier.");
862 UNREACHABLE();
863 }
864 break;
865 } else {
866 continue;
867 }
868 default:
869 continue;
870 }
872 if (current() == '?') {
873 quantifier_type = RegExpQuantifier::NON_GREEDY;
874 Advance();
875 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
876 // FLAG_regexp_possessive_quantifier is a debug-only flag.
877 quantifier_type = RegExpQuantifier::POSSESSIVE;
878 Advance();
879 }
880 if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
881 ReportError("invalid quantifier.");
882 UNREACHABLE();
883 }
884 }
885}
886
887#ifdef DEBUG
888// Currently only used in an ASSERT.
889static bool IsSpecialClassEscape(uint32_t c) {
890 switch (c) {
891 case 'd':
892 case 'D':
893 case 's':
894 case 'S':
895 case 'w':
896 case 'W':
897 return true;
898 default:
899 return false;
900 }
901}
902#endif
903
904RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
905 RegExpParserState* state) {
906 RegExpLookaround::Type lookaround_type = state->lookaround_type();
907 bool is_named_capture = false;
908 const RegExpCaptureName* capture_name = nullptr;
909 SubexpressionType subexpr_type = CAPTURE;
910 Advance();
911 if (current() == '?') {
912 switch (Next()) {
913 case ':':
914 Advance(2);
915 subexpr_type = GROUPING;
916 break;
917 case '=':
918 Advance(2);
919 lookaround_type = RegExpLookaround::LOOKAHEAD;
920 subexpr_type = POSITIVE_LOOKAROUND;
921 break;
922 case '!':
923 Advance(2);
924 lookaround_type = RegExpLookaround::LOOKAHEAD;
925 subexpr_type = NEGATIVE_LOOKAROUND;
926 break;
927 case '<':
928 Advance();
929 if (Next() == '=') {
930 Advance(2);
931 lookaround_type = RegExpLookaround::LOOKBEHIND;
932 subexpr_type = POSITIVE_LOOKAROUND;
933 break;
934 } else if (Next() == '!') {
935 Advance(2);
936 lookaround_type = RegExpLookaround::LOOKBEHIND;
937 subexpr_type = NEGATIVE_LOOKAROUND;
938 break;
939 }
940 is_named_capture = true;
941 has_named_captures_ = true;
942 Advance();
943 break;
944 default:
945 ReportError("Invalid group");
946 UNREACHABLE();
947 }
948 }
949
950 if (subexpr_type == CAPTURE) {
951 if (captures_started_ >= kMaxCaptures) {
952 ReportError("Too many captures");
953 UNREACHABLE();
954 }
955 captures_started_++;
956
957 if (is_named_capture) {
958 capture_name = ParseCaptureGroupName();
959 }
960 }
961 // Store current state and begin new disjunction parsing.
962 return new (Z)
963 RegExpParserState(state, subexpr_type, lookaround_type, captures_started_,
964 capture_name, state->builder()->flags(), Z);
965}
966
967// In order to know whether an escape is a backreference or not we have to scan
968// the entire regexp and find the number of capturing parentheses. However we
969// don't want to scan the regexp twice unless it is necessary. This mini-parser
970// is called when needed. It can see the difference between capturing and
971// noncapturing parentheses and can skip character classes and backslash-escaped
972// characters.
973void RegExpParser::ScanForCaptures() {
974 ASSERT(!is_scanned_for_captures_);
975 const intptr_t saved_position = position();
976 // Start with captures started previous to current position
977 intptr_t capture_count = captures_started();
978 // Add count of captures after this position.
979 uintptr_t n;
980 while ((n = current()) != kEndMarker) {
981 Advance();
982 switch (n) {
983 case '\\':
984 Advance();
985 break;
986 case '[': {
987 uintptr_t c;
988 while ((c = current()) != kEndMarker) {
989 Advance();
990 if (c == '\\') {
991 Advance();
992 } else {
993 if (c == ']') break;
994 }
995 }
996 break;
997 }
998 case '(':
999 // At this point we could be in
1000 // * a non-capturing group '(:',
1001 // * a lookbehind assertion '(?<=' '(?<!'
1002 // * or a named capture '(?<'.
1003 //
1004 // Of these, only named captures are capturing groups.
1005 if (current() == '?') {
1006 Advance();
1007 if (current() != '<') break;
1008
1009 Advance();
1010 if (current() == '=' || current() == '!') break;
1011
1012 // Found a possible named capture. It could turn out to be a syntax
1013 // error (e.g. an unterminated or invalid name), but that distinction
1014 // does not matter for our purposes.
1015 has_named_captures_ = true;
1016 }
1017 capture_count++;
1018 break;
1019 }
1020 }
1021 capture_count_ = capture_count;
1022 is_scanned_for_captures_ = true;
1023 Reset(saved_position);
1024}
1025
1026bool RegExpParser::ParseBackReferenceIndex(intptr_t* index_out) {
1027 ASSERT('\\' == current());
1028 ASSERT('1' <= Next() && Next() <= '9');
1029 // Try to parse a decimal literal that is no greater than the total number
1030 // of left capturing parentheses in the input.
1031 intptr_t start = position();
1032 intptr_t value = Next() - '0';
1033 Advance(2);
1034 while (true) {
1035 uint32_t c = current();
1036 if (Utils::IsDecimalDigit(c)) {
1037 value = 10 * value + (c - '0');
1038 if (value > kMaxCaptures) {
1039 Reset(start);
1040 return false;
1041 }
1042 Advance();
1043 } else {
1044 break;
1045 }
1046 }
1047 if (value > captures_started()) {
1048 if (!is_scanned_for_captures_) ScanForCaptures();
1049 if (value > capture_count_) {
1050 Reset(start);
1051 return false;
1052 }
1053 }
1054 *index_out = value;
1055 return true;
1056}
1057
1058namespace {
1059
1060static inline constexpr bool IsAsciiIdentifierPart(uint32_t ch) {
1061 return Utils::IsAlphaNumeric(ch) || ch == '_' || ch == '$';
1062}
1063
1064// ES#sec-names-and-keywords Names and Keywords
1065// UnicodeIDStart, '$', '_' and '\'
1066static bool IsIdentifierStartSlow(uint32_t c) {
1067 // cannot use u_isIDStart because it does not work for
1068 // Other_ID_Start characters.
1069 return u_hasBinaryProperty(c, UCHAR_ID_START) ||
1070 (c < 0x60 && (c == '$' || c == '\\' || c == '_'));
1071}
1072
1073// ES#sec-names-and-keywords Names and Keywords
1074// UnicodeIDContinue, '$', '_', '\', ZWJ, and ZWNJ
1075static bool IsIdentifierPartSlow(uint32_t c) {
1076 const uint32_t kZeroWidthNonJoiner = 0x200C;
1077 const uint32_t kZeroWidthJoiner = 0x200D;
1078 // Can't use u_isIDPart because it does not work for
1079 // Other_ID_Continue characters.
1080 return u_hasBinaryProperty(c, UCHAR_ID_CONTINUE) ||
1081 (c < 0x60 && (c == '$' || c == '\\' || c == '_')) ||
1082 c == kZeroWidthNonJoiner || c == kZeroWidthJoiner;
1083}
1084
1085static inline bool IsIdentifierStart(uint32_t c) {
1086 if (c > 127) return IsIdentifierStartSlow(c);
1087 return IsAsciiIdentifierPart(c) && !Utils::IsDecimalDigit(c);
1088}
1089
1090static inline bool IsIdentifierPart(uint32_t c) {
1091 if (c > 127) return IsIdentifierPartSlow(c);
1092 return IsAsciiIdentifierPart(c);
1093}
1094
1095static bool IsSameName(const RegExpCaptureName* name1,
1096 const RegExpCaptureName* name2) {
1097 if (name1->length() != name2->length()) return false;
1098 for (intptr_t i = 0; i < name1->length(); i++) {
1099 if (name1->At(i) != name2->At(i)) return false;
1100 }
1101 return true;
1102}
1103
1104} // end namespace
1105
1106static void PushCodeUnit(RegExpCaptureName* v, uint32_t code_unit) {
1107 if (code_unit <= Utf16::kMaxCodeUnit) {
1108 v->Add(code_unit);
1109 } else {
1110 uint16_t units[2];
1111 Utf16::Encode(code_unit, units);
1112 v->Add(units[0]);
1113 v->Add(units[1]);
1114 }
1115}
1116
1117const RegExpCaptureName* RegExpParser::ParseCaptureGroupName() {
1118 auto name = new (Z) RegExpCaptureName();
1119
1120 bool at_start = true;
1121 while (true) {
1122 uint32_t c = current();
1123 Advance();
1124
1125 // Convert unicode escapes.
1126 if (c == '\\' && current() == 'u') {
1127 Advance();
1128 if (!ParseUnicodeEscape(&c)) {
1129 ReportError("Invalid Unicode escape sequence");
1130 UNREACHABLE();
1131 }
1132 }
1133
1134 // The backslash char is misclassified as both ID_Start and ID_Continue.
1135 if (c == '\\') {
1136 ReportError("Invalid capture group name");
1137 UNREACHABLE();
1138 }
1139
1140 if (at_start) {
1141 if (!IsIdentifierStart(c)) {
1142 ReportError("Invalid capture group name");
1143 UNREACHABLE();
1144 }
1145 PushCodeUnit(name, c);
1146 at_start = false;
1147 } else {
1148 if (c == '>') {
1149 break;
1150 } else if (IsIdentifierPart(c)) {
1151 PushCodeUnit(name, c);
1152 } else {
1153 ReportError("Invalid capture group name");
1154 UNREACHABLE();
1155 }
1156 }
1157 }
1158
1159 return name;
1160}
1161
1162intptr_t RegExpParser::GetNamedCaptureIndex(const RegExpCaptureName* name) {
1163 for (const auto& capture : *named_captures_) {
1164 if (IsSameName(name, capture->name())) return capture->index();
1165 }
1166 return -1;
1167}
1168
1169void RegExpParser::CreateNamedCaptureAtIndex(const RegExpCaptureName* name,
1170 intptr_t index) {
1171 ASSERT(0 < index && index <= captures_started_);
1172 ASSERT(name != nullptr);
1173
1174 if (named_captures_ == nullptr) {
1175 named_captures_ = new (Z) ZoneGrowableArray<RegExpCapture*>(1);
1176 } else {
1177 // Check for duplicates and bail if we find any. Currently O(n^2).
1178 if (GetNamedCaptureIndex(name) >= 0) {
1179 ReportError("Duplicate capture group name");
1180 UNREACHABLE();
1181 }
1182 }
1183
1184 RegExpCapture* capture = GetCapture(index);
1185 ASSERT(capture->name() == nullptr);
1186
1187 capture->set_name(name);
1188 named_captures_->Add(capture);
1189}
1190
1191bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
1192 RegExpParserState* state) {
1193 // The parser is assumed to be on the '<' in \k<name>.
1194 if (current() != '<') {
1195 ReportError("Invalid named reference");
1196 UNREACHABLE();
1197 }
1198
1199 Advance();
1200 const RegExpCaptureName* name = ParseCaptureGroupName();
1201 if (name == nullptr) {
1202 return false;
1203 }
1204
1205 if (state->IsInsideCaptureGroup(name)) {
1206 builder->AddEmpty();
1207 } else {
1208 RegExpBackReference* atom = new (Z) RegExpBackReference(builder->flags());
1209 atom->set_name(name);
1210
1211 builder->AddAtom(atom);
1212
1213 if (named_back_references_ == nullptr) {
1214 named_back_references_ =
1215 new (Z) ZoneGrowableArray<RegExpBackReference*>(1);
1216 }
1217 named_back_references_->Add(atom);
1218 }
1219
1220 return true;
1221}
1222
1223void RegExpParser::PatchNamedBackReferences() {
1224 if (named_back_references_ == nullptr) return;
1225
1226 if (named_captures_ == nullptr) {
1227 ReportError("Invalid named capture referenced");
1228 return;
1229 }
1230
1231 // Look up and patch the actual capture for each named back reference.
1232 // Currently O(n^2), optimize if necessary.
1233 for (intptr_t i = 0; i < named_back_references_->length(); i++) {
1234 RegExpBackReference* ref = named_back_references_->At(i);
1235 intptr_t index = GetNamedCaptureIndex(ref->name());
1236
1237 if (index < 0) {
1238 ReportError("Invalid named capture referenced");
1239 UNREACHABLE();
1240 }
1241 ref->set_capture(GetCapture(index));
1242 }
1243}
1244
1245RegExpCapture* RegExpParser::GetCapture(intptr_t index) {
1246 // The index for the capture groups are one-based. Its index in the list is
1247 // zero-based.
1248 const intptr_t know_captures =
1249 is_scanned_for_captures_ ? capture_count_ : captures_started_;
1250 ASSERT(index <= know_captures);
1251 if (captures_ == nullptr) {
1252 captures_ = new (Z) ZoneGrowableArray<RegExpCapture*>(know_captures);
1253 }
1254 while (captures_->length() < know_captures) {
1255 captures_->Add(new (Z) RegExpCapture(captures_->length() + 1));
1256 }
1257 return captures_->At(index - 1);
1258}
1259
1260ArrayPtr RegExpParser::CreateCaptureNameMap() {
1261 if (named_captures_ == nullptr || named_captures_->is_empty()) {
1262 return Array::null();
1263 }
1264
1265 const intptr_t len = named_captures_->length() * 2;
1266
1267 const Array& array = Array::Handle(Array::New(len));
1268
1269 auto& name = String::Handle();
1270 auto& smi = Smi::Handle();
1271 for (intptr_t i = 0; i < named_captures_->length(); i++) {
1272 RegExpCapture* capture = named_captures_->At(i);
1273 name =
1274 String::FromUTF16(capture->name()->data(), capture->name()->length());
1275 smi = Smi::New(capture->index());
1276 array.SetAt(i * 2, name);
1277 array.SetAt(i * 2 + 1, smi);
1278 }
1279
1280 return array.ptr();
1281}
1282
1283bool RegExpParser::HasNamedCaptures() {
1284 if (has_named_captures_ || is_scanned_for_captures_) {
1285 return has_named_captures_;
1286 }
1287
1288 ScanForCaptures();
1289 ASSERT(is_scanned_for_captures_);
1290 return has_named_captures_;
1291}
1292
1293bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(intptr_t index) {
1294 for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1295 if (s->group_type() != CAPTURE) continue;
1296 // Return true if we found the matching capture index.
1297 if (index == s->capture_index()) return true;
1298 // Abort if index is larger than what has been parsed up till this state.
1299 if (index > s->capture_index()) return false;
1300 }
1301 return false;
1302}
1303
1304bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
1305 const RegExpCaptureName* name) {
1306 ASSERT(name != nullptr);
1307 for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1308 if (s->capture_name() == nullptr) continue;
1309 if (IsSameName(s->capture_name(), name)) return true;
1310 }
1311 return false;
1312}
1313
1314// QuantifierPrefix ::
1315// { DecimalDigits }
1316// { DecimalDigits , }
1317// { DecimalDigits , DecimalDigits }
1318//
1319// Returns true if parsing succeeds, and set the min_out and max_out
1320// values. Values are truncated to RegExpTree::kInfinity if they overflow.
1322 intptr_t* max_out) {
1323 ASSERT(current() == '{');
1324 intptr_t start = position();
1325 Advance();
1326 intptr_t min = 0;
1327 if (!Utils::IsDecimalDigit(current())) {
1328 Reset(start);
1329 return false;
1330 }
1331 while (Utils::IsDecimalDigit(current())) {
1332 intptr_t next = current() - '0';
1333 if (min > (RegExpTree::kInfinity - next) / 10) {
1334 // Overflow. Skip past remaining decimal digits and return -1.
1335 do {
1336 Advance();
1337 } while (Utils::IsDecimalDigit(current()));
1339 break;
1340 }
1341 min = 10 * min + next;
1342 Advance();
1343 }
1344 intptr_t max = 0;
1345 if (current() == '}') {
1346 max = min;
1347 Advance();
1348 } else if (current() == ',') {
1349 Advance();
1350 if (current() == '}') {
1352 Advance();
1353 } else {
1354 while (Utils::IsDecimalDigit(current())) {
1355 intptr_t next = current() - '0';
1356 if (max > (RegExpTree::kInfinity - next) / 10) {
1357 do {
1358 Advance();
1359 } while (Utils::IsDecimalDigit(current()));
1361 break;
1362 }
1363 max = 10 * max + next;
1364 Advance();
1365 }
1366 if (current() != '}') {
1367 Reset(start);
1368 return false;
1369 }
1370 Advance();
1371 }
1372 } else {
1373 Reset(start);
1374 return false;
1375 }
1376 *min_out = min;
1377 *max_out = max;
1378 return true;
1379}
1380
1382 ASSERT(('0' <= current() && current() <= '7') || current() == kEndMarker);
1383 // For compatibility with some other browsers (not all), we parse
1384 // up to three octal digits with a value below 256.
1385 uint32_t value = current() - '0';
1386 Advance();
1387 if ('0' <= current() && current() <= '7') {
1388 value = value * 8 + current() - '0';
1389 Advance();
1390 if (value < 32 && '0' <= current() && current() <= '7') {
1391 value = value * 8 + current() - '0';
1392 Advance();
1393 }
1394 }
1395 return value;
1396}
1397
1398// Returns the value (0 .. 15) of a hexadecimal character c.
1399// If c is not a legal hexadecimal character, returns a value < 0.
1400static inline intptr_t HexValue(uint32_t c) {
1401 c -= '0';
1402 if (static_cast<unsigned>(c) <= 9) return c;
1403 c = (c | 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.
1404 if (static_cast<unsigned>(c) <= 5) return c + 10;
1405 return -1;
1406}
1407
1408bool RegExpParser::ParseHexEscape(intptr_t length, uint32_t* value) {
1409 intptr_t start = position();
1410 uint32_t val = 0;
1411 bool done = false;
1412 for (intptr_t i = 0; !done; i++) {
1413 uint32_t c = current();
1414 intptr_t d = HexValue(c);
1415 if (d < 0) {
1416 Reset(start);
1417 return false;
1418 }
1419 val = val * 16 + d;
1420 Advance();
1421 if (i == length - 1) {
1422 done = true;
1423 }
1424 }
1425 *value = val;
1426 return true;
1427}
1428
1429// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
1431 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1432 // allowed). In the latter case, the number of hex digits between { } is
1433 // arbitrary. \ and u have already been read.
1434 if (current() == '{' && is_unicode()) {
1435 int start = position();
1436 Advance();
1438 if (current() == '}') {
1439 Advance();
1440 return true;
1441 }
1442 }
1443 Reset(start);
1444 return false;
1445 }
1446 // \u but no {, or \u{...} escapes not allowed.
1447 bool result = ParseHexEscape(4, value);
1449 current() == '\\') {
1450 // Attempt to read trail surrogate.
1451 int start = position();
1452 if (Next() == 'u') {
1453 Advance(2);
1454 uint32_t trail;
1455 if (ParseHexEscape(4, &trail) && Utf16::IsTrailSurrogate(trail)) {
1456 *value = Utf16::Decode(static_cast<uint16_t>(*value),
1457 static_cast<uint16_t>(trail));
1458 return true;
1459 }
1460 }
1461 Reset(start);
1462 }
1463 return result;
1464}
1465
1466namespace {
1467
1468bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1469 const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1470 if (short_name != nullptr && strcmp(property_name, short_name) == 0) {
1471 return true;
1472 }
1473 for (int i = 0;; i++) {
1474 const char* long_name = u_getPropertyName(
1475 property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1476 if (long_name == nullptr) break;
1477 if (strcmp(property_name, long_name) == 0) return true;
1478 }
1479 return false;
1480}
1481
1482bool IsExactPropertyValueAlias(const char* property_value_name,
1483 UProperty property,
1484 int32_t property_value) {
1485 const char* short_name =
1486 u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1487 if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
1488 return true;
1489 }
1490 for (int i = 0;; i++) {
1491 const char* long_name = u_getPropertyValueName(
1492 property, property_value,
1493 static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1494 if (long_name == nullptr) break;
1495 if (strcmp(property_value_name, long_name) == 0) return true;
1496 }
1497 return false;
1498}
1499
1500bool LookupPropertyValueName(UProperty property,
1501 const char* property_value_name,
1502 bool negate,
1503 ZoneGrowableArray<CharacterRange>* result) {
1504 UProperty property_for_lookup = property;
1505 if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
1506 // For the property Script_Extensions, we have to do the property value
1507 // name lookup as if the property is Script.
1508 property_for_lookup = UCHAR_SCRIPT;
1509 }
1510 int32_t property_value =
1511 u_getPropertyValueEnum(property_for_lookup, property_value_name);
1512 if (property_value == UCHAR_INVALID_CODE) return false;
1513
1514 // We require the property name to match exactly to one of the property value
1515 // aliases. However, u_getPropertyValueEnum uses loose matching.
1516 if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
1517 property_value)) {
1518 return false;
1519 }
1520
1521 UErrorCode ec = U_ZERO_ERROR;
1522 icu::UnicodeSet set;
1523 set.applyIntPropertyValue(property, property_value, ec);
1524 bool success = ec == U_ZERO_ERROR && (set.isEmpty() == 0);
1525
1526 if (success) {
1527 set.removeAllStrings();
1528 if (negate) set.complement();
1529 for (int i = 0; i < set.getRangeCount(); i++) {
1530 result->Add(
1531 CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)));
1532 }
1533 }
1534 return success;
1535}
1536
1537template <size_t N>
1538inline bool NameEquals(const char* name, const char (&literal)[N]) {
1539 return strncmp(name, literal, N + 1) == 0;
1540}
1541
1542bool LookupSpecialPropertyValueName(const char* name,
1543 ZoneGrowableArray<CharacterRange>* result,
1544 bool negate) {
1545 if (NameEquals(name, "Any")) {
1546 if (negate) {
1547 // Leave the list of character ranges empty, since the negation of 'Any'
1548 // is the empty set.
1549 } else {
1551 }
1552 } else if (NameEquals(name, "ASCII")) {
1554 : CharacterRange::Range(0x0, 0x7F));
1555 } else if (NameEquals(name, "Assigned")) {
1556 return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1557 !negate, result);
1558 } else {
1559 return false;
1560 }
1561 return true;
1562}
1563
1564// Explicitly list supported binary properties. The spec forbids supporting
1565// properties outside of this set to ensure interoperability.
1566bool IsSupportedBinaryProperty(UProperty property) {
1567 switch (property) {
1568 case UCHAR_ALPHABETIC:
1569 // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
1570 // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
1571 case UCHAR_ASCII_HEX_DIGIT:
1572 // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
1573 case UCHAR_BIDI_CONTROL:
1574 case UCHAR_BIDI_MIRRORED:
1575 case UCHAR_CASE_IGNORABLE:
1576 case UCHAR_CASED:
1577 case UCHAR_CHANGES_WHEN_CASEFOLDED:
1578 case UCHAR_CHANGES_WHEN_CASEMAPPED:
1579 case UCHAR_CHANGES_WHEN_LOWERCASED:
1580 case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
1581 case UCHAR_CHANGES_WHEN_TITLECASED:
1582 case UCHAR_CHANGES_WHEN_UPPERCASED:
1583 case UCHAR_DASH:
1584 case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
1585 case UCHAR_DEPRECATED:
1586 case UCHAR_DIACRITIC:
1587 case UCHAR_EMOJI:
1588 case UCHAR_EMOJI_COMPONENT:
1589 case UCHAR_EMOJI_MODIFIER_BASE:
1590 case UCHAR_EMOJI_MODIFIER:
1591 case UCHAR_EMOJI_PRESENTATION:
1592 case UCHAR_EXTENDED_PICTOGRAPHIC:
1593 case UCHAR_EXTENDER:
1594 case UCHAR_GRAPHEME_BASE:
1595 case UCHAR_GRAPHEME_EXTEND:
1596 case UCHAR_HEX_DIGIT:
1597 case UCHAR_ID_CONTINUE:
1598 case UCHAR_ID_START:
1599 case UCHAR_IDEOGRAPHIC:
1600 case UCHAR_IDS_BINARY_OPERATOR:
1601 case UCHAR_IDS_TRINARY_OPERATOR:
1602 case UCHAR_JOIN_CONTROL:
1603 case UCHAR_LOGICAL_ORDER_EXCEPTION:
1604 case UCHAR_LOWERCASE:
1605 case UCHAR_MATH:
1606 case UCHAR_NONCHARACTER_CODE_POINT:
1607 case UCHAR_PATTERN_SYNTAX:
1608 case UCHAR_PATTERN_WHITE_SPACE:
1609 case UCHAR_QUOTATION_MARK:
1610 case UCHAR_RADICAL:
1611 case UCHAR_REGIONAL_INDICATOR:
1612 case UCHAR_S_TERM:
1613 case UCHAR_SOFT_DOTTED:
1614 case UCHAR_TERMINAL_PUNCTUATION:
1615 case UCHAR_UNIFIED_IDEOGRAPH:
1616 case UCHAR_UPPERCASE:
1617 case UCHAR_VARIATION_SELECTOR:
1618 case UCHAR_WHITE_SPACE:
1619 case UCHAR_XID_CONTINUE:
1620 case UCHAR_XID_START:
1621 return true;
1622 default:
1623 break;
1624 }
1625 return false;
1626}
1627
1628bool IsUnicodePropertyValueCharacter(char c) {
1629 // https://tc39.github.io/proposal-regexp-unicode-property-escapes/
1630 //
1631 // Note that using this to validate each parsed char is quite conservative.
1632 // A possible alternative solution would be to only ensure the parsed
1633 // property name/value candidate string does not contain '\0' characters and
1634 // let ICU lookups trigger the final failure.
1635 if (Utils::IsAlphaNumeric(c)) return true;
1636 return (c == '_');
1637}
1638
1639} // anonymous namespace
1640
1642 ZoneGrowableArray<char>* name_2) {
1643 ASSERT(name_1->is_empty());
1644 ASSERT(name_2->is_empty());
1645 // Parse the property class as follows:
1646 // - In \p{name}, 'name' is interpreted
1647 // - either as a general category property value name.
1648 // - or as a binary property name.
1649 // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1650 // and 'value' is interpreted as one of the available property value names.
1651 // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1652 // - Loose matching is not applied.
1653 if (current() == '{') {
1654 // Parse \p{[PropertyName=]PropertyNameValue}
1655 for (Advance(); current() != '}' && current() != '='; Advance()) {
1656 if (!IsUnicodePropertyValueCharacter(current())) return false;
1657 if (!has_next()) return false;
1658 name_1->Add(static_cast<char>(current()));
1659 }
1660 if (current() == '=') {
1661 for (Advance(); current() != '}'; Advance()) {
1662 if (!IsUnicodePropertyValueCharacter(current())) return false;
1663 if (!has_next()) return false;
1664 name_2->Add(static_cast<char>(current()));
1665 }
1666 name_2->Add(0); // null-terminate string.
1667 }
1668 } else {
1669 return false;
1670 }
1671 Advance();
1672 name_1->Add(0); // null-terminate string.
1673
1674 ASSERT(static_cast<size_t>(name_1->length() - 1) == strlen(name_1->data()));
1675 ASSERT(name_2->is_empty() ||
1676 static_cast<size_t>(name_2->length() - 1) == strlen(name_2->data()));
1677 return true;
1678}
1679
1682 bool negate,
1684 ZoneGrowableArray<char>* name_2) {
1685 ASSERT(name_1->At(name_1->length() - 1) == '\0');
1686 ASSERT(name_2->is_empty() || name_2->At(name_2->length() - 1) == '\0');
1687 if (name_2->is_empty()) {
1688 // First attempt to interpret as general category property value name.
1689 const char* name = name_1->data();
1690 if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1691 add_to)) {
1692 return true;
1693 }
1694 // Interpret "Any", "ASCII", and "Assigned".
1695 if (LookupSpecialPropertyValueName(name, add_to, negate)) {
1696 return true;
1697 }
1698 // Then attempt to interpret as binary property name with value name 'Y'.
1699 UProperty property = u_getPropertyEnum(name);
1700 if (!IsSupportedBinaryProperty(property)) return false;
1701 if (!IsExactPropertyAlias(name, property)) return false;
1702 return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to);
1703 } else {
1704 // Both property name and value name are specified. Attempt to interpret
1705 // the property name as enumerated property.
1706 const char* property_name = name_1->data();
1707 const char* value_name = name_2->data();
1708 UProperty property = u_getPropertyEnum(property_name);
1709 if (!IsExactPropertyAlias(property_name, property)) return false;
1710 if (property == UCHAR_GENERAL_CATEGORY) {
1711 // We want to allow aggregate value names such as "Letter".
1712 property = UCHAR_GENERAL_CATEGORY_MASK;
1713 } else if (property != UCHAR_SCRIPT &&
1714 property != UCHAR_SCRIPT_EXTENSIONS) {
1715 return false;
1716 }
1717 return LookupPropertyValueName(property, value_name, negate, add_to);
1718 }
1719}
1720
1722 uint32_t* value) {
1723 uint32_t x = 0;
1724 int d = HexValue(current());
1725 if (d < 0) {
1726 return false;
1727 }
1728 while (d >= 0) {
1729 x = x * 16 + d;
1730 if (x > max_value) {
1731 return false;
1732 }
1733 Advance();
1734 d = HexValue(current());
1735 }
1736 *value = x;
1737 return true;
1738}
1739
1741 ASSERT(current() == '\\');
1742 DEBUG_ASSERT(has_next() && !IsSpecialClassEscape(Next()));
1743 Advance();
1744 switch (current()) {
1745 case 'b':
1746 Advance();
1747 return '\b';
1748 // ControlEscape :: one of
1749 // f n r t v
1750 case 'f':
1751 Advance();
1752 return '\f';
1753 case 'n':
1754 Advance();
1755 return '\n';
1756 case 'r':
1757 Advance();
1758 return '\r';
1759 case 't':
1760 Advance();
1761 return '\t';
1762 case 'v':
1763 Advance();
1764 return '\v';
1765 case 'c': {
1766 uint32_t controlLetter = Next();
1767 uint32_t letter = controlLetter & ~('A' ^ 'a');
1768 // For compatibility with JSC, inside a character class
1769 // we also accept digits and underscore as control characters.
1770 if (letter >= 'A' && letter <= 'Z') {
1771 Advance(2);
1772 // Control letters mapped to ASCII control characters in the range
1773 // 0x00-0x1f.
1774 return controlLetter & 0x1f;
1775 }
1776 if (is_unicode()) {
1777 // With /u, \c# or \c_ are invalid.
1778 ReportError("Invalid class escape");
1779 UNREACHABLE();
1780 }
1781 if (Utils::IsDecimalDigit(controlLetter) || controlLetter == '_') {
1782 Advance(2);
1783 return controlLetter & 0x1f;
1784 }
1785 // We match JSC in reading the backslash as a literal
1786 // character instead of as starting an escape.
1787 return '\\';
1788 }
1789 case '0':
1790 // With /u, \0 is interpreted as NUL if not followed by another digit.
1791 if (is_unicode() && !(Next() >= '0' && Next() <= '9')) {
1792 Advance();
1793 return 0;
1794 }
1796 case '1':
1797 case '2':
1798 case '3':
1799 case '4':
1800 case '5':
1801 case '6':
1802 case '7':
1803 // For compatibility, we interpret a decimal escape that isn't
1804 // a back reference (and therefore either \0 or not valid according
1805 // to the specification) as a 1..3 digit octal character code.
1806 if (is_unicode()) {
1807 // With \u, decimal escape is not interpreted as octal character code.
1808 ReportError("Invalid class escape");
1809 UNREACHABLE();
1810 }
1811 return ParseOctalLiteral();
1812 case 'x': {
1813 Advance();
1814 uint32_t value;
1815 if (ParseHexEscape(2, &value)) {
1816 return value;
1817 }
1818 if (is_unicode()) {
1819 // With \u, invalid escapes are not treated as identity escapes.
1820 ReportError("Invalid escape");
1821 UNREACHABLE();
1822 }
1823 // If \x is not followed by a two-digit hexadecimal, treat it
1824 // as an identity escape.
1825 return 'x';
1826 }
1827 case 'u': {
1828 Advance();
1829 uint32_t value;
1830 if (ParseUnicodeEscape(&value)) {
1831 return value;
1832 }
1833 if (is_unicode()) {
1834 // With \u, invalid escapes are not treated as identity escapes.
1836 UNREACHABLE();
1837 }
1838 // If \u is not followed by a four-digit hexadecimal, treat it
1839 // as an identity escape.
1840 return 'u';
1841 }
1842 default: {
1843 // Extended identity escape. We accept any character that hasn't
1844 // been matched by a more specific case, not just the subset required
1845 // by the ECMAScript specification.
1846 uint32_t result = current();
1847 if (!is_unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
1848 Advance();
1849 return result;
1850 }
1852 UNREACHABLE();
1853 }
1854 }
1855 return 0;
1856}
1857
1859 bool add_unicode_case_equivalents,
1860 uint32_t* char_out) {
1861 uint32_t first = current();
1862 if (first == '\\') {
1863 switch (Next()) {
1864 case 'w':
1865 case 'W':
1866 case 'd':
1867 case 'D':
1868 case 's':
1869 case 'S': {
1870 CharacterRange::AddClassEscape(static_cast<uint16_t>(Next()), ranges,
1871 add_unicode_case_equivalents);
1872 Advance(2);
1873 return true;
1874 }
1875 case 'p':
1876 case 'P': {
1877 if (!is_unicode()) break;
1878 bool negate = Next() == 'P';
1879 Advance(2);
1880 auto name_1 = new (Z) ZoneGrowableArray<char>();
1881 auto name_2 = new (Z) ZoneGrowableArray<char>();
1882 if (!ParsePropertyClassName(name_1, name_2) ||
1883 !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
1884 ReportError("Invalid property name in character class");
1885 UNREACHABLE();
1886 }
1887 return true;
1888 }
1889 case kEndMarker:
1890 ReportError("\\ at end of pattern");
1891 UNREACHABLE();
1892 default:
1893 break;
1894 }
1895 *char_out = ParseClassCharacterEscape();
1896 return false;
1897 }
1898 Advance();
1899 *char_out = first;
1900 return false;
1901}
1902
1904 static const char* kUnterminated = "Unterminated character class";
1905 static const char* kRangeInvalid = "Invalid character class";
1906 static const char* kRangeOutOfOrder = "Range out of order in character class";
1907
1908 ASSERT(current() == '[');
1909 Advance();
1910 bool is_negated = false;
1911 if (current() == '^') {
1912 is_negated = true;
1913 Advance();
1914 }
1917 bool add_unicode_case_equivalents = is_unicode() && builder->ignore_case();
1918 while (has_more() && current() != ']') {
1919 uint32_t char_1 = 0;
1920 bool is_class_1 =
1921 ParseClassEscape(ranges, add_unicode_case_equivalents, &char_1);
1922 if (current() == '-') {
1923 Advance();
1924 if (current() == kEndMarker) {
1925 // If we reach the end we break out of the loop and let the
1926 // following code report an error.
1927 break;
1928 } else if (current() == ']') {
1929 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1));
1930 ranges->Add(CharacterRange::Singleton('-'));
1931 break;
1932 }
1933 uint32_t char_2 = 0;
1934 bool is_class_2 =
1935 ParseClassEscape(ranges, add_unicode_case_equivalents, &char_2);
1936 if (is_class_1 || is_class_2) {
1937 // Either end is an escaped character class. Treat the '-' verbatim.
1938 if (is_unicode()) {
1939 // ES2015 21.2.2.15.1 step 1.
1940 ReportError(kRangeInvalid);
1941 UNREACHABLE();
1942 }
1943 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1));
1944 ranges->Add(CharacterRange::Singleton('-'));
1945 if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2));
1946 continue;
1947 }
1948 if (char_1 > char_2) {
1949 ReportError(kRangeOutOfOrder);
1950 UNREACHABLE();
1951 }
1952 ranges->Add(CharacterRange::Range(char_1, char_2));
1953 } else {
1954 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1));
1955 }
1956 }
1957 if (!has_more()) {
1958 ReportError(kUnterminated);
1959 UNREACHABLE();
1960 }
1961 Advance();
1962 RegExpCharacterClass::CharacterClassFlags character_class_flags =
1964 if (is_negated) character_class_flags |= RegExpCharacterClass::NEGATED;
1965 return new (Z)
1966 RegExpCharacterClass(ranges, builder->flags(), character_class_flags);
1967}
1968
1969// ----------------------------------------------------------------------------
1970// The Parser interface.
1971
1975 ASSERT(result != nullptr);
1976 RegExpParser parser(input, &result->error, flags);
1977 // Throws an exception if 'input' is not valid.
1978 RegExpTree* tree = parser.ParsePattern();
1979 ASSERT(tree != nullptr);
1980 ASSERT(result->error.IsNull());
1981 result->tree = tree;
1982 intptr_t capture_count = parser.captures_started();
1983 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1984 result->contains_anchor = parser.contains_anchor();
1985 result->capture_name_map = parser.CreateCaptureNameMap();
1986 result->capture_count = capture_count;
1987}
1988
1989} // namespace dart
static void done(const char *config, const char *src, const char *srcOptions, const char *name)
Definition: DM.cpp:263
SkPoint pos
static float next(float f)
#define UNREACHABLE()
Definition: assert.h:248
#define DEBUG_ASSERT(cond)
Definition: assert.h:321
#define N
Definition: beziers.cpp:19
static ArrayPtr New(intptr_t len, Heap::Space space=Heap::kNew)
Definition: object.h:10959
void AddArray(const BaseGrowableArray< T, B, Allocator > &src)
void Add(const T &value)
const T & At(intptr_t index) const
intptr_t length() const
static void AddClassEscape(uint16_t type, ZoneGrowableArray< CharacterRange > *ranges)
Definition: regexp.cc:4651
static CharacterRange Range(int32_t from, int32_t to)
Definition: regexp.h:40
static CharacterRange Everything()
Definition: regexp.h:44
static CharacterRange Singleton(int32_t value)
Definition: regexp.h:37
static void Canonicalize(ZoneGrowableArray< CharacterRange > *ranges)
Definition: regexp.cc:4876
static void Negate(ZoneGrowableArray< CharacterRange > *src, ZoneGrowableArray< CharacterRange > *dst)
Definition: regexp.cc:4912
static ZoneGrowableArray< CharacterRange > * List(Zone *zone, CharacterRange range)
Definition: regexp.h:47
static DART_NORETURN void ThrowByType(ExceptionType type, const Array &arguments)
Definition: exceptions.cc:1052
static ObjectPtr null()
Definition: object.h:433
static Object & Handle()
Definition: object.h:407
void AddAssertion(RegExpTree *tree)
void AddCharacterClass(RegExpCharacterClass *cc)
bool AddQuantifierToAtom(intptr_t min, intptr_t max, RegExpQuantifier::QuantifierType type)
void AddCharacterClassForDesugaring(uint32_t c)
void AddTerm(RegExpTree *tree)
RegExpBuilder(RegExpFlags flags)
RegExpTree * ToRegExp()
void AddEscapedUnicodeCharacter(uint32_t character)
bool ignore_case() const
Definition: regexp_parser.h:39
void AddUnicodeCharacter(uint32_t character)
void AddAtom(RegExpTree *tree)
void AddCharacter(uint16_t character)
void set_body(RegExpTree *body)
Definition: regexp_ast.h:333
static CharacterClassFlags DefaultFlags()
Definition: regexp_ast.h:164
static RegExpEmpty * GetInstance()
Definition: regexp_ast.h:440
uint32_t ParseOctalLiteral()
void ReportError(const char *message)
void Reset(intptr_t pos)
bool is_unicode() const
RegExpTree * ParseDisjunction()
RegExpParser(const String &in, String *error, RegExpFlags regexp_flags)
bool ParseClassEscape(ZoneGrowableArray< CharacterRange > *ranges, bool add_unicode_case_equivalents, uint32_t *char_out)
RegExpTree * ParsePattern()
bool ParseUnlimitedLengthHexNumber(uint32_t max_value, uint32_t *value)
bool ParseHexEscape(intptr_t length, uint32_t *value)
uint32_t ParseClassCharacterEscape()
bool ParseIntervalQuantifier(intptr_t *min_out, intptr_t *max_out)
static bool IsSyntaxCharacterOrSlash(uint32_t c)
bool AddPropertyClassRange(ZoneGrowableArray< CharacterRange > *add_to, bool negate, ZoneGrowableArray< char > *name_1, ZoneGrowableArray< char > *name_2)
static constexpr uint32_t kEndMarker
bool ParseUnicodeEscape(uint32_t *value)
bool ParseBackReferenceIndex(intptr_t *index_out)
RegExpTree * ParseCharacterClass(const RegExpBuilder *builder)
static constexpr intptr_t kMaxCaptures
static void ParseRegExp(const String &input, RegExpFlags regexp_flags, RegExpCompileData *result)
bool ParsePropertyClassName(ZoneGrowableArray< char > *name_1, ZoneGrowableArray< char > *name_2)
intptr_t captures_started()
virtual intptr_t max_match() const =0
static constexpr intptr_t kInfinity
Definition: regexp_ast.h:39
virtual bool IsTextElement() const
Definition: regexp_ast.h:44
static SmiPtr New(intptr_t value)
Definition: object.h:10006
intptr_t Length() const
Definition: object.h:10210
static StringPtr ConcatAll(const Array &strings, Heap::Space space=Heap::kNew)
Definition: object.cc:24048
static StringPtr New(const char *cstr, Heap::Space space=Heap::kNew)
Definition: object.cc:23698
uint16_t CharAt(intptr_t index) const
Definition: object.h:10259
static StringPtr FromUTF16(const uint16_t *utf16_array, intptr_t array_len, Heap::Space space=Heap::kNew)
Definition: object.cc:23739
static const String & Blank()
Definition: symbols.h:647
static constexpr int32_t kLeadSurrogateStart
Definition: unicode.h:159
static constexpr int32_t kMaxCodeUnit
Definition: unicode.h:158
static int32_t Decode(uint16_t lead, uint16_t trail)
Definition: unicode.h:151
static void Encode(int32_t codepoint, uint16_t *dst)
Definition: unicode.cc:273
static bool IsLeadSurrogate(uint32_t ch)
Definition: unicode.h:126
static bool IsTrailSurrogate(uint32_t ch)
Definition: unicode.h:131
static constexpr int32_t kMaxCodePoint
Definition: unicode.h:18
static constexpr bool IsDecimalDigit(uint32_t c)
Definition: utils.h:386
static constexpr bool IsAlphaNumeric(uint32_t c)
Definition: utils.h:381
#define ASSERT(E)
VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE auto & d
Definition: main.cc:19
struct MyStruct s
AtkStateType state
FlutterSemanticsFlag flags
G_BEGIN_DECLS G_MODULE_EXPORT FlValue * args
const uint8_t uint32_t uint32_t GError ** error
uint8_t value
GAsyncResult * result
static float max(float r, float g, float b)
Definition: hsl.cpp:49
static float min(float r, float g, float b)
Definition: hsl.cpp:48
size_t length
std::u16string text
Win32Message message
double x
Definition: dart_vm.cc:33
const char *const name
static void PushCodeUnit(RegExpCaptureName *v, uint32_t code_unit)
static const char * kUnicodeIdentity
ZoneGrowableArray< uint16_t > RegExpCaptureName
Definition: regexp_parser.h:73
static int HexValue(char digit)
Definition: uri.cc:86
static constexpr bool FLAG_regexp_possessive_quantifier
DEF_SWITCHES_START aot vmservice shared library Name of the *so containing AOT compiled Dart assets for launching the service isolate vm snapshot The VM snapshot data that will be memory mapped as read only SnapshotAssetPath must be present isolate snapshot The isolate snapshot data that will be memory mapped as read only SnapshotAssetPath must be present cache dir Path to the cache directory This is different from the persistent_cache_path in embedder which is used for Skia shader cache icu native lib Path to the library file that exports the ICU data vm service The hostname IP address on which the Dart VM Service should be served If not set
Definition: switches.h:76
parser
Definition: zip.py:78
#define FALL_THROUGH
Definition: globals.h:15
#define Z
#define LAST(x)
Definition: regexp_parser.h:69