Flutter Engine
The Flutter Engine
Loading...
Searching...
No Matches
regexp.cc
Go to the documentation of this file.
1// Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
2// for details. All rights reserved. Use of this source code is governed by a
3// BSD-style license that can be found in the LICENSE file.
4
5#include "vm/regexp.h"
6
7#include <memory>
8
10#include "platform/unicode.h"
11
12#include "unicode/uniset.h"
13
14#include "vm/dart_entry.h"
15#include "vm/regexp_assembler.h"
17#include "vm/regexp_ast.h"
18#include "vm/symbols.h"
19#include "vm/thread.h"
20#include "vm/unibrow-inl.h"
21
22#if !defined(DART_PRECOMPILED_RUNTIME)
24#endif // !defined(DART_PRECOMPILED_RUNTIME)
25
26#define Z (zone())
27
28namespace dart {
29
30// Default to generating optimized regexp code.
31static constexpr bool kRegexpOptimization = true;
32
33// More makes code generation slower, less makes V8 benchmark score lower.
34static constexpr intptr_t kMaxLookaheadForBoyerMoore = 8;
35
37 const int32_t* ranges,
38 intptr_t ranges_length,
39 Interval new_range) {
40 ASSERT((ranges_length & 1) == 1);
41 ASSERT(ranges[ranges_length - 1] == Utf::kMaxCodePoint + 1);
42 if (containment == kLatticeUnknown) return containment;
43 bool inside = false;
44 int32_t last = 0;
45 for (intptr_t i = 0; i < ranges_length;
46 inside = !inside, last = ranges[i], i++) {
47 // Consider the range from last to ranges[i].
48 // We haven't got to the new range yet.
49 if (ranges[i] <= new_range.from()) continue;
50 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
51 // inclusive, but the values in ranges are not.
52 if (last <= new_range.from() && new_range.to() < ranges[i]) {
53 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
54 }
55 return kLatticeUnknown;
56 }
57 return containment;
58}
59
60// -------------------------------------------------------------------
61// Implementation of the Irregexp regular expression engine.
62//
63// The Irregexp regular expression engine is intended to be a complete
64// implementation of ECMAScript regular expressions. It generates
65// IR code that is subsequently compiled to native code.
66
67// The Irregexp regexp engine is structured in three steps.
68// 1) The parser generates an abstract syntax tree. See regexp_ast.cc.
69// 2) From the AST a node network is created. The nodes are all
70// subclasses of RegExpNode. The nodes represent states when
71// executing a regular expression. Several optimizations are
72// performed on the node network.
73// 3) From the nodes we generate IR instructions that can actually
74// execute the regular expression (perform the search). The
75// code generation step is described in more detail below.
76
77// Code generation.
78//
79// The nodes are divided into four main categories.
80// * Choice nodes
81// These represent places where the regular expression can
82// match in more than one way. For example on entry to an
83// alternation (foo|bar) or a repetition (*, +, ? or {}).
84// * Action nodes
85// These represent places where some action should be
86// performed. Examples include recording the current position
87// in the input string to a register (in order to implement
88// captures) or other actions on register for example in order
89// to implement the counters needed for {} repetitions.
90// * Matching nodes
91// These attempt to match some element part of the input string.
92// Examples of elements include character classes, plain strings
93// or back references.
94// * End nodes
95// These are used to implement the actions required on finding
96// a successful match or failing to find a match.
97//
98// The code generated maintains some state as it runs. This consists of the
99// following elements:
100//
101// * The capture registers. Used for string captures.
102// * Other registers. Used for counters etc.
103// * The current position.
104// * The stack of backtracking information. Used when a matching node
105// fails to find a match and needs to try an alternative.
106//
107// Conceptual regular expression execution model:
108//
109// There is a simple conceptual model of regular expression execution
110// which will be presented first. The actual code generated is a more
111// efficient simulation of the simple conceptual model:
112//
113// * Choice nodes are implemented as follows:
114// For each choice except the last {
115// push current position
116// push backtrack code location
117// <generate code to test for choice>
118// backtrack code location:
119// pop current position
120// }
121// <generate code to test for last choice>
122//
123// * Actions nodes are generated as follows
124// <push affected registers on backtrack stack>
125// <generate code to perform action>
126// push backtrack code location
127// <generate code to test for following nodes>
128// backtrack code location:
129// <pop affected registers to restore their state>
130// <pop backtrack location from stack and go to it>
131//
132// * Matching nodes are generated as follows:
133// if input string matches at current position
134// update current position
135// <generate code to test for following nodes>
136// else
137// <pop backtrack location from stack and go to it>
138//
139// Thus it can be seen that the current position is saved and restored
140// by the choice nodes, whereas the registers are saved and restored by
141// by the action nodes that manipulate them.
142//
143// The other interesting aspect of this model is that nodes are generated
144// at the point where they are needed by a recursive call to Emit(). If
145// the node has already been code generated then the Emit() call will
146// generate a jump to the previously generated code instead. In order to
147// limit recursion it is possible for the Emit() function to put the node
148// on a work list for later generation and instead generate a jump. The
149// destination of the jump is resolved later when the code is generated.
150//
151// Actual regular expression code generation.
152//
153// Code generation is actually more complicated than the above. In order
154// to improve the efficiency of the generated code some optimizations are
155// performed
156//
157// * Choice nodes have 1-character lookahead.
158// A choice node looks at the following character and eliminates some of
159// the choices immediately based on that character. This is not yet
160// implemented.
161// * Simple greedy loops store reduced backtracking information.
162// A quantifier like /.*foo/m will greedily match the whole input. It will
163// then need to backtrack to a point where it can match "foo". The naive
164// implementation of this would push each character position onto the
165// backtracking stack, then pop them off one by one. This would use space
166// proportional to the length of the input string. However since the "."
167// can only match in one way and always has a constant length (in this case
168// of 1) it suffices to store the current position on the top of the stack
169// once. Matching now becomes merely incrementing the current position and
170// backtracking becomes decrementing the current position and checking the
171// result against the stored current position. This is faster and saves
172// space.
173// * The current state is virtualized.
174// This is used to defer expensive operations until it is clear that they
175// are needed and to generate code for a node more than once, allowing
176// specialized an efficient versions of the code to be created. This is
177// explained in the section below.
178//
179// Execution state virtualization.
180//
181// Instead of emitting code, nodes that manipulate the state can record their
182// manipulation in an object called the Trace. The Trace object can record a
183// current position offset, an optional backtrack code location on the top of
184// the virtualized backtrack stack and some register changes. When a node is
185// to be emitted it can flush the Trace or update it. Flushing the Trace
186// will emit code to bring the actual state into line with the virtual state.
187// Avoiding flushing the state can postpone some work (e.g. updates of capture
188// registers). Postponing work can save time when executing the regular
189// expression since it may be found that the work never has to be done as a
190// failure to match can occur. In addition it is much faster to jump to a
191// known backtrack code location than it is to pop an unknown backtrack
192// location from the stack and jump there.
193//
194// The virtual state found in the Trace affects code generation. For example
195// the virtual state contains the difference between the actual current
196// position and the virtual current position, and matching code needs to use
197// this offset to attempt a match in the correct location of the input
198// string. Therefore code generated for a non-trivial trace is specialized
199// to that trace. The code generator therefore has the ability to generate
200// code for each node several times. In order to limit the size of the
201// generated code there is an arbitrary limit on how many specialized sets of
202// code may be generated for a given node. If the limit is reached, the
203// trace is flushed and a generic version of the code for a node is emitted.
204// This is subsequently used for that node. The code emitted for non-generic
205// trace is not recorded in the node and so it cannot currently be reused in
206// the event that code generation is requested for an identical trace.
207
211
213 text->AddElement(TextElement::Atom(this));
214}
215
219
221 for (intptr_t i = 0; i < elements()->length(); i++)
222 text->AddElement((*elements())[i]);
223}
224
228
232
233intptr_t TextElement::length() const {
234 switch (text_type()) {
235 case ATOM:
236 return atom()->length();
237
238 case CHAR_CLASS:
239 return 1;
240 }
241 UNREACHABLE();
242 return 0;
243}
244
246 public:
247 FrequencyCollator() : total_samples_(0) {
248 for (intptr_t i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
249 frequencies_[i] = CharacterFrequency(i);
250 }
251 }
252
253 void CountCharacter(intptr_t character) {
254 intptr_t index = (character & RegExpMacroAssembler::kTableMask);
255 frequencies_[index].Increment();
256 total_samples_++;
257 }
258
259 // Does not measure in percent, but rather per-128 (the table size from the
260 // regexp macro assembler).
261 intptr_t Frequency(intptr_t in_character) {
262 ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character);
263 if (total_samples_ < 1) return 1; // Division by zero.
264 intptr_t freq_in_per128 =
265 (frequencies_[in_character].counter() * 128) / total_samples_;
266 return freq_in_per128;
267 }
268
269 private:
270 class CharacterFrequency {
271 public:
272 CharacterFrequency() : counter_(0), character_(-1) {}
273 explicit CharacterFrequency(intptr_t character)
274 : counter_(0), character_(character) {}
275
276 void Increment() { counter_++; }
277 intptr_t counter() { return counter_; }
278 intptr_t character() { return character_; }
279
280 private:
281 intptr_t counter_;
282 intptr_t character_;
283
285 };
286
287 private:
288 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
289 intptr_t total_samples_;
290};
291
293 public:
294 RegExpCompiler(intptr_t capture_count, bool is_one_byte);
295
296 intptr_t AllocateRegister() { return next_register_++; }
297
298 // Lookarounds to match lone surrogates for unicode character class matches
299 // are never nested. We can therefore reuse registers.
301 if (unicode_lookaround_stack_register_ == kNoRegister) {
302 unicode_lookaround_stack_register_ = AllocateRegister();
303 }
304 return unicode_lookaround_stack_register_;
305 }
306
308 if (unicode_lookaround_position_register_ == kNoRegister) {
309 unicode_lookaround_position_register_ = AllocateRegister();
310 }
311 return unicode_lookaround_position_register_;
312 }
313
314#if !defined(DART_PRECOMPILED_RUNTIME)
317 intptr_t capture_count,
318 const String& pattern);
319#endif
320
324 intptr_t capture_count,
325 const String& pattern);
326
327 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
328
329 static constexpr intptr_t kImplementationOffset = 0;
330 static constexpr intptr_t kNumberOfRegistersOffset = 0;
331 static constexpr intptr_t kCodeOffset = 1;
332
333 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
334 EndNode* accept() { return accept_; }
335
336 static constexpr intptr_t kMaxRecursion = 100;
337 inline intptr_t recursion_depth() { return recursion_depth_; }
338 inline void IncrementRecursionDepth() { recursion_depth_++; }
339 inline void DecrementRecursionDepth() { recursion_depth_--; }
340
341 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
342
343 inline bool one_byte() const { return is_one_byte_; }
344 bool read_backward() { return read_backward_; }
345 void set_read_backward(bool value) { read_backward_ = value; }
346 FrequencyCollator* frequency_collator() { return &frequency_collator_; }
347
348 intptr_t current_expansion_factor() { return current_expansion_factor_; }
349 void set_current_expansion_factor(intptr_t value) {
350 current_expansion_factor_ = value;
351 }
352
353 Zone* zone() const { return zone_; }
354
355 static constexpr intptr_t kNoRegister = -1;
356
357 private:
358 EndNode* accept_;
359 intptr_t next_register_;
360 intptr_t unicode_lookaround_stack_register_;
361 intptr_t unicode_lookaround_position_register_;
363 intptr_t recursion_depth_;
364 RegExpMacroAssembler* macro_assembler_;
365 bool is_one_byte_;
366 bool reg_exp_too_big_;
367 bool read_backward_;
368 intptr_t current_expansion_factor_;
369 FrequencyCollator frequency_collator_;
370 Zone* zone_;
371};
372
374 public:
376 compiler->IncrementRecursionDepth();
377 }
379
380 private:
381 RegExpCompiler* compiler_;
382};
383
387
388// Attempts to compile the regexp using an Irregexp code generator. Returns
389// a fixed array or a null handle depending on whether it succeeded.
390RegExpCompiler::RegExpCompiler(intptr_t capture_count, bool is_one_byte)
391 : next_register_(2 * (capture_count + 1)),
392 unicode_lookaround_stack_register_(kNoRegister),
393 unicode_lookaround_position_register_(kNoRegister),
394 work_list_(nullptr),
395 recursion_depth_(0),
396 is_one_byte_(is_one_byte),
397 reg_exp_too_big_(false),
398 read_backward_(false),
399 current_expansion_factor_(1),
400 zone_(Thread::Current()->zone()) {
401 accept_ = new (Z) EndNode(EndNode::ACCEPT, Z);
402}
403
404#if !defined(DART_PRECOMPILED_RUNTIME)
406 IRRegExpMacroAssembler* macro_assembler,
408 intptr_t capture_count,
409 const String& pattern) {
410 macro_assembler->set_slow_safe(false /* use_slow_safe_regexp_compiler */);
411 macro_assembler_ = macro_assembler;
412
414 work_list_ = &work_list;
416 macro_assembler_->PushBacktrack(&fail);
417 Trace new_trace;
418 start->Emit(this, &new_trace);
419 macro_assembler_->BindBlock(&fail);
420 macro_assembler_->Fail();
421 while (!work_list.is_empty()) {
422 work_list.RemoveLast()->Emit(this, &new_trace);
423 }
424 if (reg_exp_too_big_) return IrregexpRegExpTooBig();
425
426 macro_assembler->GenerateBacktrackBlock();
427 macro_assembler->FinalizeRegistersArray();
428
430 macro_assembler->backtrack_goto(), macro_assembler->graph_entry(),
431 macro_assembler->num_blocks(), macro_assembler->num_stack_locals(),
432 next_register_);
433}
434#endif
435
437 BytecodeRegExpMacroAssembler* macro_assembler,
439 intptr_t capture_count,
440 const String& pattern) {
441 macro_assembler->set_slow_safe(false /* use_slow_safe_regexp_compiler */);
442 macro_assembler_ = macro_assembler;
443
445 work_list_ = &work_list;
447 macro_assembler_->PushBacktrack(&fail);
448 Trace new_trace;
449 start->Emit(this, &new_trace);
450 macro_assembler_->BindBlock(&fail);
451 macro_assembler_->Fail();
452 while (!work_list.is_empty()) {
453 work_list.RemoveLast()->Emit(this, &new_trace);
454 }
455 if (reg_exp_too_big_) return IrregexpRegExpTooBig();
456
457 TypedData& bytecode = TypedData::ZoneHandle(macro_assembler->GetBytecode());
458 return RegExpEngine::CompilationResult(&bytecode, next_register_);
459}
460
463 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
464 return range.Contains(that);
465 } else {
466 return reg() == that;
467 }
468}
469
470bool Trace::mentions_reg(intptr_t reg) {
471 for (DeferredAction* action = actions_; action != nullptr;
472 action = action->next()) {
473 if (action->Mentions(reg)) return true;
474 }
475 return false;
476}
477
478bool Trace::GetStoredPosition(intptr_t reg, intptr_t* cp_offset) {
479 ASSERT(*cp_offset == 0);
480 for (DeferredAction* action = actions_; action != nullptr;
481 action = action->next()) {
482 if (action->Mentions(reg)) {
483 if (action->action_type() == ActionNode::STORE_POSITION) {
484 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
485 return true;
486 } else {
487 return false;
488 }
489 }
490 }
491 return false;
492}
493
494// This is called as we come into a loop choice node and some other tricky
495// nodes. It normalizes the state of the code generator to ensure we can
496// generate generic code.
497intptr_t Trace::FindAffectedRegisters(OutSet* affected_registers, Zone* zone) {
498 intptr_t max_register = RegExpCompiler::kNoRegister;
499 for (DeferredAction* action = actions_; action != nullptr;
500 action = action->next()) {
501 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
502 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
503 for (intptr_t i = range.from(); i <= range.to(); i++)
504 affected_registers->Set(i, zone);
505 if (range.to() > max_register) max_register = range.to();
506 } else {
507 affected_registers->Set(action->reg(), zone);
508 if (action->reg() > max_register) max_register = action->reg();
509 }
510 }
511 return max_register;
512}
513
514void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
515 intptr_t max_register,
516 const OutSet& registers_to_pop,
517 const OutSet& registers_to_clear) {
518 for (intptr_t reg = max_register; reg >= 0; reg--) {
519 if (registers_to_pop.Get(reg)) {
520 assembler->PopRegister(reg);
521 } else if (registers_to_clear.Get(reg)) {
522 intptr_t clear_to = reg;
523 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
524 reg--;
525 }
526 assembler->ClearRegisters(reg, clear_to);
527 }
528 }
529}
530
531void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
532 intptr_t max_register,
533 const OutSet& affected_registers,
534 OutSet* registers_to_pop,
535 OutSet* registers_to_clear,
536 Zone* zone) {
537 for (intptr_t reg = 0; reg <= max_register; reg++) {
538 if (!affected_registers.Get(reg)) {
539 continue;
540 }
541
542 // The chronologically first deferred action in the trace
543 // is used to infer the action needed to restore a register
544 // to its previous state (or not, if it's safe to ignore it).
545 enum DeferredActionUndoType { ACTION_IGNORE, ACTION_RESTORE, ACTION_CLEAR };
546 DeferredActionUndoType undo_action = ACTION_IGNORE;
547
548 intptr_t value = 0;
549 bool absolute = false;
550 bool clear = false;
551 const intptr_t kNoStore = kMinInt32;
552 intptr_t store_position = kNoStore;
553 // This is a little tricky because we are scanning the actions in reverse
554 // historical order (newest first).
555 for (DeferredAction* action = actions_; action != nullptr;
556 action = action->next()) {
557 if (action->Mentions(reg)) {
558 switch (action->action_type()) {
560 Trace::DeferredSetRegister* psr =
561 static_cast<Trace::DeferredSetRegister*>(action);
562 if (!absolute) {
563 value += psr->value();
564 absolute = true;
565 }
566 // SET_REGISTER is currently only used for newly introduced loop
567 // counters. They can have a significant previous value if they
568 // occur in a loop. TODO(lrn): Propagate this information, so we
569 // can set undo_action to ACTION_IGNORE if we know there is no
570 // value to restore.
571 undo_action = ACTION_RESTORE;
572 ASSERT(store_position == kNoStore);
573 ASSERT(!clear);
574 break;
575 }
577 if (!absolute) {
578 value++;
579 }
580 ASSERT(store_position == kNoStore);
581 ASSERT(!clear);
582 undo_action = ACTION_RESTORE;
583 break;
585 Trace::DeferredCapture* pc =
586 static_cast<Trace::DeferredCapture*>(action);
587 if (!clear && store_position == kNoStore) {
588 store_position = pc->cp_offset();
589 }
590
591 // For captures we know that stores and clears alternate.
592 // Other register, are never cleared, and if the occur
593 // inside a loop, they might be assigned more than once.
594 if (reg <= 1) {
595 // Registers zero and one, aka "capture zero", is
596 // always set correctly if we succeed. There is no
597 // need to undo a setting on backtrack, because we
598 // will set it again or fail.
599 undo_action = ACTION_IGNORE;
600 } else {
601 undo_action = pc->is_capture() ? ACTION_CLEAR : ACTION_RESTORE;
602 }
603 ASSERT(!absolute);
604 ASSERT(value == 0);
605 break;
606 }
608 // Since we're scanning in reverse order, if we've already
609 // set the position we have to ignore historically earlier
610 // clearing operations.
611 if (store_position == kNoStore) {
612 clear = true;
613 }
614 undo_action = ACTION_RESTORE;
615 ASSERT(!absolute);
616 ASSERT(value == 0);
617 break;
618 }
619 default:
620 UNREACHABLE();
621 break;
622 }
623 }
624 }
625 // Prepare for the undo-action (e.g., push if it's going to be popped).
626 if (undo_action == ACTION_RESTORE) {
627 assembler->PushRegister(reg);
628 registers_to_pop->Set(reg, zone);
629 } else if (undo_action == ACTION_CLEAR) {
630 registers_to_clear->Set(reg, zone);
631 }
632 // Perform the chronologically last action (or accumulated increment)
633 // for the register.
634 if (store_position != kNoStore) {
635 assembler->WriteCurrentPositionToRegister(reg, store_position);
636 } else if (clear) {
637 assembler->ClearRegisters(reg, reg);
638 } else if (absolute) {
639 assembler->SetRegister(reg, value);
640 } else if (value != 0) {
641 assembler->AdvanceRegister(reg, value);
642 }
643 }
644}
645
646// This is called as we come into a loop choice node and some other tricky
647// nodes. It normalizes the state of the code generator to ensure we can
648// generate generic code.
650 RegExpMacroAssembler* assembler = compiler->macro_assembler();
651
652 ASSERT(!is_trivial());
653
654 if (actions_ == nullptr && backtrack() == nullptr) {
655 // Here we just have some deferred cp advances to fix and we are back to
656 // a normal situation. We may also have to forget some information gained
657 // through a quick check that was already performed.
658 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
659 // Create a new trivial state and generate the node with that.
660 Trace new_state;
661 successor->Emit(compiler, &new_state);
662 return;
663 }
664
665 // Generate deferred actions here along with code to undo them again.
666 OutSet affected_registers;
667
668 if (backtrack() != nullptr) {
669 // Here we have a concrete backtrack location. These are set up by choice
670 // nodes and so they indicate that we have a deferred save of the current
671 // position which we may need to emit here.
672 assembler->PushCurrentPosition();
673 }
674 Zone* zone = successor->zone();
675 intptr_t max_register = FindAffectedRegisters(&affected_registers, zone);
676 OutSet registers_to_pop;
677 OutSet registers_to_clear;
678 PerformDeferredActions(assembler, max_register, affected_registers,
679 &registers_to_pop, &registers_to_clear, zone);
680 if (cp_offset_ != 0) {
681 assembler->AdvanceCurrentPosition(cp_offset_);
682 }
683
684 // Create a new trivial state and generate the node with that.
685 BlockLabel undo;
686 assembler->PushBacktrack(&undo);
687 Trace new_state;
688 successor->Emit(compiler, &new_state);
689
690 // On backtrack we need to restore state.
691 assembler->BindBlock(&undo);
692 RestoreAffectedRegisters(assembler, max_register, registers_to_pop,
693 registers_to_clear);
694 if (backtrack() == nullptr) {
695 assembler->Backtrack();
696 } else {
697 assembler->PopCurrentPosition();
698 assembler->GoTo(backtrack());
699 }
700}
701
703 RegExpMacroAssembler* assembler = compiler->macro_assembler();
704
705 // Omit flushing the trace. We discard the entire stack frame anyway.
706
707 if (!label()->is_bound()) {
708 // We are completely independent of the trace, since we ignore it,
709 // so this code can be used as the generic version.
710 assembler->BindBlock(label());
711 }
712
713 // Throw away everything on the backtrack stack since the start
714 // of the negative submatch and restore the character position.
715 assembler->ReadCurrentPositionFromRegister(current_position_register_);
716 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
717 if (clear_capture_count_ > 0) {
718 // Clear any captures that might have been performed during the success
719 // of the body of the negative look-ahead.
720 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
721 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
722 }
723 // Now that we have unwound the stack we find at the top of the stack the
724 // backtrack that the BeginSubmatch node got.
725 assembler->Backtrack();
726}
727
729 if (!trace->is_trivial()) {
730 trace->Flush(compiler, this);
731 return;
732 }
733 RegExpMacroAssembler* assembler = compiler->macro_assembler();
734 if (!label()->is_bound()) {
735 assembler->BindBlock(label());
736 }
737 switch (action_) {
738 case ACCEPT:
739 assembler->Succeed();
740 return;
741 case BACKTRACK:
742 assembler->GoTo(trace->backtrack());
743 return;
744 case NEGATIVE_SUBMATCH_SUCCESS:
745 // This case is handled in a different virtual method.
746 UNREACHABLE();
747 }
749}
750
752 if (guards_ == nullptr) guards_ = new (zone) ZoneGrowableArray<Guard*>(1);
753 guards_->Add(guard);
754}
755
757 intptr_t val,
758 RegExpNode* on_success) {
760 new (on_success->zone()) ActionNode(SET_REGISTER, on_success);
761 result->data_.u_store_register.reg = reg;
762 result->data_.u_store_register.value = val;
763 return result;
764}
765
767 RegExpNode* on_success) {
769 new (on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
770 result->data_.u_increment_register.reg = reg;
771 return result;
772}
773
775 bool is_capture,
776 RegExpNode* on_success) {
778 new (on_success->zone()) ActionNode(STORE_POSITION, on_success);
779 result->data_.u_position_register.reg = reg;
780 result->data_.u_position_register.is_capture = is_capture;
781 return result;
782}
783
786 new (on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
787 result->data_.u_clear_captures.range_from = range.from();
788 result->data_.u_clear_captures.range_to = range.to();
789 return result;
790}
791
793 intptr_t position_reg,
794 RegExpNode* on_success) {
796 new (on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
797 result->data_.u_submatch.stack_pointer_register = stack_reg;
798 result->data_.u_submatch.current_position_register = position_reg;
799 return result;
800}
801
803 intptr_t position_reg,
804 intptr_t clear_register_count,
805 intptr_t clear_register_from,
806 RegExpNode* on_success) {
807 ActionNode* result = new (on_success->zone())
808 ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
809 result->data_.u_submatch.stack_pointer_register = stack_reg;
810 result->data_.u_submatch.current_position_register = position_reg;
811 result->data_.u_submatch.clear_register_count = clear_register_count;
812 result->data_.u_submatch.clear_register_from = clear_register_from;
813 return result;
814}
815
817 intptr_t repetition_register,
818 intptr_t repetition_limit,
819 RegExpNode* on_success) {
821 new (on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
822 result->data_.u_empty_match_check.start_register = start_register;
823 result->data_.u_empty_match_check.repetition_register = repetition_register;
824 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
825 return result;
826}
827
828#define DEFINE_ACCEPT(Type) \
829 void Type##Node::Accept(NodeVisitor* visitor) { \
830 visitor->Visit##Type(this); \
831 }
833#undef DEFINE_ACCEPT
834
836 visitor->VisitLoopChoice(this);
837}
838
839// -------------------------------------------------------------------
840// Emit code.
841
842void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
843 Guard* guard,
844 Trace* trace) {
845 switch (guard->op()) {
846 case Guard::LT:
847 ASSERT(!trace->mentions_reg(guard->reg()));
848 macro_assembler->IfRegisterGE(guard->reg(), guard->value(),
849 trace->backtrack());
850 break;
851 case Guard::GEQ:
852 ASSERT(!trace->mentions_reg(guard->reg()));
853 macro_assembler->IfRegisterLT(guard->reg(), guard->value(),
854 trace->backtrack());
855 break;
856 }
857}
858
859// Returns the number of characters in the equivalence class, omitting those
860// that cannot occur in the source string because it is ASCII.
861static intptr_t GetCaseIndependentLetters(uint16_t character,
862 bool one_byte_subject,
863 int32_t* letters) {
865 intptr_t length = jsregexp_uncanonicalize.get(character, '\0', letters);
866 // Unibrow returns 0 or 1 for characters where case independence is
867 // trivial.
868 if (length == 0) {
869 letters[0] = character;
870 length = 1;
871 }
872 if (!one_byte_subject || character <= Symbols::kMaxOneCharCodeSymbol) {
873 return length;
874 }
875
876 // The standard requires that non-ASCII characters cannot have ASCII
877 // character codes in their equivalence class.
878 // TODO(dcarney): issue 3550 this is not actually true for Latin1 anymore,
879 // is it? For example, \u00C5 is equivalent to \u212B.
880 return 0;
881}
882
883static inline bool EmitSimpleCharacter(Zone* zone,
885 uint16_t c,
886 BlockLabel* on_failure,
887 intptr_t cp_offset,
888 bool check,
889 bool preloaded) {
890 RegExpMacroAssembler* assembler = compiler->macro_assembler();
891 bool bound_checked = false;
892 if (!preloaded) {
893 assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
894 bound_checked = true;
895 }
896 assembler->CheckNotCharacter(c, on_failure);
897 return bound_checked;
898}
899
900// Only emits non-letters (things that don't have case). Only used for case
901// independent matches.
902static inline bool EmitAtomNonLetter(Zone* zone,
904 uint16_t c,
905 BlockLabel* on_failure,
906 intptr_t cp_offset,
907 bool check,
908 bool preloaded) {
909 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
910 bool one_byte = compiler->one_byte();
912 intptr_t length = GetCaseIndependentLetters(c, one_byte, chars);
913 if (length < 1) {
914 // This can't match. Must be an one-byte subject and a non-one-byte
915 // character. We do not need to do anything since the one-byte pass
916 // already handled this.
917 return false; // Bounds not checked.
918 }
919 bool checked = false;
920 // We handle the length > 1 case in a later pass.
921 if (length == 1) {
922 if (one_byte && c > Symbols::kMaxOneCharCodeSymbol) {
923 // Can't match - see above.
924 return false; // Bounds not checked.
925 }
926 if (!preloaded) {
927 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
928 checked = check;
929 }
930 macro_assembler->CheckNotCharacter(c, on_failure);
931 }
932 return checked;
933}
934
936 bool one_byte,
937 uint16_t c1,
938 uint16_t c2,
939 BlockLabel* on_failure) {
940 uint16_t char_mask;
941 if (one_byte) {
943 } else {
944 char_mask = Utf16::kMaxCodeUnit;
945 }
946 uint16_t exor = c1 ^ c2;
947 // Check whether exor has only one bit set.
948 if (((exor - 1) & exor) == 0) {
949 // If c1 and c2 differ only by one bit.
950 // Ecma262UnCanonicalize always gives the highest number last.
951 ASSERT(c2 > c1);
952 uint16_t mask = char_mask ^ exor;
953 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
954 return true;
955 }
956 ASSERT(c2 > c1);
957 uint16_t diff = c2 - c1;
958 if (((diff - 1) & diff) == 0 && c1 >= diff) {
959 // If the characters differ by 2^n but don't differ by one bit then
960 // subtract the difference from the found character, then do the or
961 // trick. We avoid the theoretical case where negative numbers are
962 // involved in order to simplify code generation.
963 uint16_t mask = char_mask ^ diff;
964 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, diff, mask,
965 on_failure);
966 return true;
967 }
968 return false;
969}
970
971typedef bool EmitCharacterFunction(Zone* zone,
973 uint16_t c,
974 BlockLabel* on_failure,
975 intptr_t cp_offset,
976 bool check,
977 bool preloaded);
978
979// Only emits letters (things that have case). Only used for case independent
980// matches.
981static inline bool EmitAtomLetter(Zone* zone,
983 uint16_t c,
984 BlockLabel* on_failure,
985 intptr_t cp_offset,
986 bool check,
987 bool preloaded) {
988 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
989 bool one_byte = compiler->one_byte();
991 intptr_t length = GetCaseIndependentLetters(c, one_byte, chars);
992 if (length <= 1) return false;
993 // We may not need to check against the end of the input string
994 // if this character lies before a character that matched.
995 if (!preloaded) {
996 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
997 }
1000 switch (length) {
1001 case 2: {
1002 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1003 chars[1], on_failure)) {
1004 } else {
1005 macro_assembler->CheckCharacter(chars[0], &ok);
1006 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1007 macro_assembler->BindBlock(&ok);
1008 }
1009 break;
1010 }
1011 case 4:
1012 macro_assembler->CheckCharacter(chars[3], &ok);
1014 case 3:
1015 macro_assembler->CheckCharacter(chars[0], &ok);
1016 macro_assembler->CheckCharacter(chars[1], &ok);
1017 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1018 macro_assembler->BindBlock(&ok);
1019 break;
1020 default:
1021 UNREACHABLE();
1022 break;
1023 }
1024 return true;
1025}
1026
1028 uint16_t border,
1029 BlockLabel* fall_through,
1030 BlockLabel* above_or_equal,
1031 BlockLabel* below) {
1032 if (below != fall_through) {
1033 masm->CheckCharacterLT(border, below);
1034 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1035 } else {
1036 masm->CheckCharacterGT(border - 1, above_or_equal);
1037 }
1038}
1039
1041 uint16_t first,
1042 uint16_t last,
1043 BlockLabel* fall_through,
1044 BlockLabel* in_range,
1045 BlockLabel* out_of_range) {
1046 if (in_range == fall_through) {
1047 if (first == last) {
1048 masm->CheckNotCharacter(first, out_of_range);
1049 } else {
1050 masm->CheckCharacterNotInRange(first, last, out_of_range);
1051 }
1052 } else {
1053 if (first == last) {
1054 masm->CheckCharacter(first, in_range);
1055 } else {
1056 masm->CheckCharacterInRange(first, last, in_range);
1057 }
1058 if (out_of_range != fall_through) masm->GoTo(out_of_range);
1059 }
1060}
1061
1062// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1063// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1066 intptr_t start_index,
1067 intptr_t end_index,
1068 uint16_t min_char,
1069 BlockLabel* fall_through,
1070 BlockLabel* even_label,
1071 BlockLabel* odd_label) {
1072 const intptr_t kSize = RegExpMacroAssembler::kTableSize;
1073 const intptr_t kMask = RegExpMacroAssembler::kTableMask;
1074
1075 intptr_t base = (min_char & ~kMask);
1076
1077 // Assert that everything is on one kTableSize page.
1078 for (intptr_t i = start_index; i <= end_index; i++) {
1079 ASSERT((ranges->At(i) & ~kMask) == base);
1080 }
1081 ASSERT(start_index == 0 || (ranges->At(start_index - 1) & ~kMask) <= base);
1082
1083 char templ[kSize];
1084 BlockLabel* on_bit_set;
1085 BlockLabel* on_bit_clear;
1086 intptr_t bit;
1087 if (even_label == fall_through) {
1088 on_bit_set = odd_label;
1089 on_bit_clear = even_label;
1090 bit = 1;
1091 } else {
1092 on_bit_set = even_label;
1093 on_bit_clear = odd_label;
1094 bit = 0;
1095 }
1096 for (intptr_t i = 0; i < (ranges->At(start_index) & kMask) && i < kSize;
1097 i++) {
1098 templ[i] = bit;
1099 }
1100 intptr_t j = 0;
1101 bit ^= 1;
1102 for (intptr_t i = start_index; i < end_index; i++) {
1103 for (j = (ranges->At(i) & kMask); j < (ranges->At(i + 1) & kMask); j++) {
1104 templ[j] = bit;
1105 }
1106 bit ^= 1;
1107 }
1108 for (intptr_t i = j; i < kSize; i++) {
1109 templ[i] = bit;
1110 }
1111 // TODO(erikcorry): Cache these.
1112 const TypedData& ba = TypedData::ZoneHandle(
1113 masm->zone(), TypedData::New(kTypedDataUint8ArrayCid, kSize, Heap::kOld));
1114 for (intptr_t i = 0; i < kSize; i++) {
1115 ba.SetUint8(i, templ[i]);
1116 }
1117 masm->CheckBitInTable(ba, on_bit_set);
1118 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1119}
1120
1123 intptr_t start_index,
1124 intptr_t end_index,
1125 intptr_t cut_index,
1126 BlockLabel* even_label,
1127 BlockLabel* odd_label) {
1128 bool odd = (((cut_index - start_index) & 1) == 1);
1129 BlockLabel* in_range_label = odd ? odd_label : even_label;
1130 BlockLabel dummy;
1131 EmitDoubleBoundaryTest(masm, ranges->At(cut_index),
1132 ranges->At(cut_index + 1) - 1, &dummy, in_range_label,
1133 &dummy);
1134 ASSERT(!dummy.is_linked());
1135 // Cut out the single range by rewriting the array. This creates a new
1136 // range that is a merger of the two ranges on either side of the one we
1137 // are cutting out. The oddity of the labels is preserved.
1138 for (intptr_t j = cut_index; j > start_index; j--) {
1139 (*ranges)[j] = ranges->At(j - 1);
1140 }
1141 for (intptr_t j = cut_index + 1; j < end_index; j++) {
1142 (*ranges)[j] = ranges->At(j + 1);
1143 }
1144}
1145
1146// Unicode case. Split the search space into kSize spaces that are handled
1147// with recursion.
1149 intptr_t start_index,
1150 intptr_t end_index,
1151 intptr_t* new_start_index,
1152 intptr_t* new_end_index,
1153 uint16_t* border) {
1154 const intptr_t kSize = RegExpMacroAssembler::kTableSize;
1155 const intptr_t kMask = RegExpMacroAssembler::kTableMask;
1156
1157 uint16_t first = ranges->At(start_index);
1158 uint16_t last = ranges->At(end_index) - 1;
1159
1160 *new_start_index = start_index;
1161 *border = (ranges->At(start_index) & ~kMask) + kSize;
1162 while (*new_start_index < end_index) {
1163 if (ranges->At(*new_start_index) > *border) break;
1164 (*new_start_index)++;
1165 }
1166 // new_start_index is the index of the first edge that is beyond the
1167 // current kSize space.
1168
1169 // For very large search spaces we do a binary chop search of the non-Latin1
1170 // space instead of just going to the end of the current kSize space. The
1171 // heuristics are complicated a little by the fact that any 128-character
1172 // encoding space can be quickly tested with a table lookup, so we don't
1173 // wish to do binary chop search at a smaller granularity than that. A
1174 // 128-character space can take up a lot of space in the ranges array if,
1175 // for example, we only want to match every second character (eg. the lower
1176 // case characters on some Unicode pages).
1177 intptr_t binary_chop_index = (end_index + start_index) / 2;
1178 // The first test ensures that we get to the code that handles the Latin1
1179 // range with a single not-taken branch, speeding up this important
1180 // character range (even non-Latin1 charset-based text has spaces and
1181 // punctuation).
1182 if (*border - 1 > Symbols::kMaxOneCharCodeSymbol && // Latin1 case.
1183 end_index - start_index > (*new_start_index - start_index) * 2 &&
1184 last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1185 ranges->At(binary_chop_index) >= first + 2 * kSize) {
1186 intptr_t scan_forward_for_section_border = binary_chop_index;
1187 intptr_t new_border = (ranges->At(binary_chop_index) | kMask) + 1;
1188
1189 while (scan_forward_for_section_border < end_index) {
1190 if (ranges->At(scan_forward_for_section_border) > new_border) {
1191 *new_start_index = scan_forward_for_section_border;
1192 *border = new_border;
1193 break;
1194 }
1195 scan_forward_for_section_border++;
1196 }
1197 }
1198
1199 ASSERT(*new_start_index > start_index);
1200 *new_end_index = *new_start_index - 1;
1201 if (ranges->At(*new_end_index) == *border) {
1202 (*new_end_index)--;
1203 }
1204 if (*border >= ranges->At(end_index)) {
1205 *border = ranges->At(end_index);
1206 *new_start_index = end_index; // Won't be used.
1207 *new_end_index = end_index - 1;
1208 }
1209}
1210
1211// Gets a series of segment boundaries representing a character class. If the
1212// character is in the range between an even and an odd boundary (counting from
1213// start_index) then go to even_label, otherwise go to odd_label. We already
1214// know that the character is in the range of min_char to max_char inclusive.
1215// Either label can be null indicating backtracking. Either label can also be
1216// equal to the fall_through label.
1219 intptr_t start_index,
1220 intptr_t end_index,
1221 uint16_t min_char,
1222 uint16_t max_char,
1223 BlockLabel* fall_through,
1224 BlockLabel* even_label,
1225 BlockLabel* odd_label) {
1226 uint16_t first = ranges->At(start_index);
1227 uint16_t last = ranges->At(end_index) - 1;
1228
1229 ASSERT(min_char < first);
1230
1231 // Just need to test if the character is before or on-or-after
1232 // a particular character.
1233 if (start_index == end_index) {
1234 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1235 return;
1236 }
1237
1238 // Another almost trivial case: There is one interval in the middle that is
1239 // different from the end intervals.
1240 if (start_index + 1 == end_index) {
1241 EmitDoubleBoundaryTest(masm, first, last, fall_through, even_label,
1242 odd_label);
1243 return;
1244 }
1245
1246 // It's not worth using table lookup if there are very few intervals in the
1247 // character class.
1248 if (end_index - start_index <= 6) {
1249 // It is faster to test for individual characters, so we look for those
1250 // first, then try arbitrary ranges in the second round.
1251 static intptr_t kNoCutIndex = -1;
1252 intptr_t cut = kNoCutIndex;
1253 for (intptr_t i = start_index; i < end_index; i++) {
1254 if (ranges->At(i) == ranges->At(i + 1) - 1) {
1255 cut = i;
1256 break;
1257 }
1258 }
1259 if (cut == kNoCutIndex) cut = start_index;
1260 CutOutRange(masm, ranges, start_index, end_index, cut, even_label,
1261 odd_label);
1262 ASSERT(end_index - start_index >= 2);
1263 GenerateBranches(masm, ranges, start_index + 1, end_index - 1, min_char,
1264 max_char, fall_through, even_label, odd_label);
1265 return;
1266 }
1267
1268 // If there are a lot of intervals in the regexp, then we will use tables to
1269 // determine whether the character is inside or outside the character class.
1271
1272 if ((max_char >> kBits) == (min_char >> kBits)) {
1273 EmitUseLookupTable(masm, ranges, start_index, end_index, min_char,
1274 fall_through, even_label, odd_label);
1275 return;
1276 }
1277
1278 if ((min_char >> kBits) != (first >> kBits)) {
1279 masm->CheckCharacterLT(first, odd_label);
1280 GenerateBranches(masm, ranges, start_index + 1, end_index, first, max_char,
1281 fall_through, odd_label, even_label);
1282 return;
1283 }
1284
1285 intptr_t new_start_index = 0;
1286 intptr_t new_end_index = 0;
1287 uint16_t border = 0;
1288
1289 SplitSearchSpace(ranges, start_index, end_index, &new_start_index,
1290 &new_end_index, &border);
1291
1292 BlockLabel handle_rest;
1293 BlockLabel* above = &handle_rest;
1294 if (border == last + 1) {
1295 // We didn't find any section that started after the limit, so everything
1296 // above the border is one of the terminal labels.
1297 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
1298 ASSERT(new_end_index == end_index - 1);
1299 }
1300
1301 ASSERT(start_index <= new_end_index);
1302 ASSERT(new_start_index <= end_index);
1303 ASSERT(start_index < new_start_index);
1304 ASSERT(new_end_index < end_index);
1305 ASSERT(new_end_index + 1 == new_start_index ||
1306 (new_end_index + 2 == new_start_index &&
1307 border == ranges->At(new_end_index + 1)));
1308 ASSERT(min_char < border - 1);
1309 ASSERT(border < max_char);
1310 ASSERT(ranges->At(new_end_index) < border);
1311 ASSERT(border < ranges->At(new_start_index) ||
1312 (border == ranges->At(new_start_index) &&
1313 new_start_index == end_index && new_end_index == end_index - 1 &&
1314 border == last + 1));
1315 ASSERT(new_start_index == 0 || border >= ranges->At(new_start_index - 1));
1316
1317 masm->CheckCharacterGT(border - 1, above);
1318 BlockLabel dummy;
1319 GenerateBranches(masm, ranges, start_index, new_end_index, min_char,
1320 border - 1, &dummy, even_label, odd_label);
1321
1322 if (handle_rest.is_linked()) {
1323 masm->BindBlock(&handle_rest);
1324 bool flip = (new_start_index & 1) != (start_index & 1);
1325 GenerateBranches(masm, ranges, new_start_index, end_index, border, max_char,
1326 &dummy, flip ? odd_label : even_label,
1327 flip ? even_label : odd_label);
1328 }
1329}
1330
1331static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1333 bool one_byte,
1334 BlockLabel* on_failure,
1335 intptr_t cp_offset,
1336 bool check_offset,
1337 bool preloaded,
1338 Zone* zone) {
1339 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();
1340 if (!CharacterRange::IsCanonical(ranges)) {
1342 }
1343
1344 uint16_t max_char;
1345 if (one_byte) {
1347 } else {
1348 max_char = Utf16::kMaxCodeUnit;
1349 }
1350
1351 intptr_t range_count = ranges->length();
1352
1353 intptr_t last_valid_range = range_count - 1;
1354 while (last_valid_range >= 0) {
1355 const CharacterRange& range = ranges->At(last_valid_range);
1356 if (range.from() <= max_char) {
1357 break;
1358 }
1359 last_valid_range--;
1360 }
1361
1362 if (last_valid_range < 0) {
1363 if (!cc->is_negated()) {
1364 macro_assembler->GoTo(on_failure);
1365 }
1366 if (check_offset) {
1367 macro_assembler->CheckPosition(cp_offset, on_failure);
1368 }
1369 return;
1370 }
1371
1372 if (last_valid_range == 0 && ranges->At(0).IsEverything(max_char)) {
1373 if (cc->is_negated()) {
1374 macro_assembler->GoTo(on_failure);
1375 } else {
1376 // This is a common case hit by non-anchored expressions.
1377 if (check_offset) {
1378 macro_assembler->CheckPosition(cp_offset, on_failure);
1379 }
1380 }
1381 return;
1382 }
1383
1384 if (!preloaded) {
1385 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
1386 }
1387
1388 if (cc->is_standard() && macro_assembler->CheckSpecialCharacterClass(
1389 cc->standard_type(), on_failure)) {
1390 return;
1391 }
1392
1393 // A new list with ascending entries. Each entry is a code unit
1394 // where there is a boundary between code units that are part of
1395 // the class and code units that are not. Normally we insert an
1396 // entry at zero which goes to the failure label, but if there
1397 // was already one there we fall through for success on that entry.
1398 // Subsequent entries have alternating meaning (success/failure).
1399 ZoneGrowableArray<uint16_t>* range_boundaries =
1400 new (zone) ZoneGrowableArray<uint16_t>(last_valid_range);
1401
1402 bool zeroth_entry_is_failure = !cc->is_negated();
1403
1404 for (intptr_t i = 0; i <= last_valid_range; i++) {
1405 const CharacterRange& range = ranges->At(i);
1406 if (range.from() == 0) {
1407 ASSERT(i == 0);
1408 zeroth_entry_is_failure = !zeroth_entry_is_failure;
1409 } else {
1410 range_boundaries->Add(range.from());
1411 }
1412 if (range.to() + 1 <= max_char) {
1413 range_boundaries->Add(range.to() + 1);
1414 }
1415 }
1416 intptr_t end_index = range_boundaries->length() - 1;
1417
1418 BlockLabel fall_through;
1419 GenerateBranches(macro_assembler, range_boundaries,
1420 0, // start_index.
1421 end_index,
1422 0, // min_char.
1423 max_char, &fall_through,
1424 zeroth_entry_is_failure ? &fall_through : on_failure,
1425 zeroth_entry_is_failure ? on_failure : &fall_through);
1426 macro_assembler->BindBlock(&fall_through);
1427}
1428
1430
1432 Trace* trace) {
1433 // If we are generating a greedy loop then don't stop and don't reuse code.
1434 if (trace->stop_node() != nullptr) {
1435 return CONTINUE;
1436 }
1437
1438 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1439 if (trace->is_trivial()) {
1440 if (label_.is_bound()) {
1441 // We are being asked to generate a generic version, but that's already
1442 // been done so just go to it.
1443 macro_assembler->GoTo(&label_);
1444 return DONE;
1445 }
1446 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
1447 // To avoid too deep recursion we push the node to the work queue and just
1448 // generate a goto here.
1449 compiler->AddWork(this);
1450 macro_assembler->GoTo(&label_);
1451 return DONE;
1452 }
1453 // Generate generic version of the node and bind the label for later use.
1454 macro_assembler->BindBlock(&label_);
1455 return CONTINUE;
1456 }
1457
1458 // We are being asked to make a non-generic version. Keep track of how many
1459 // non-generic versions we generate so as not to overdo it.
1460 trace_count_++;
1461 if (kRegexpOptimization && trace_count_ < kMaxCopiesCodeGenerated &&
1462 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
1463 return CONTINUE;
1464 }
1465
1466 // If we get here code has been generated for this node too many times or
1467 // recursion is too deep. Time to switch to a generic version. The code for
1468 // generic versions above can handle deep recursion properly.
1469 trace->Flush(compiler, this);
1470 return DONE;
1471}
1472
1473intptr_t ActionNode::EatsAtLeast(intptr_t still_to_find,
1474 intptr_t budget,
1475 bool not_at_start) {
1476 if (budget <= 0) return 0;
1477 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
1478 return on_success()->EatsAtLeast(still_to_find, budget - 1, not_at_start);
1479}
1480
1482 intptr_t budget,
1484 bool not_at_start) {
1485 if (action_type_ == BEGIN_SUBMATCH) {
1486 bm->SetRest(offset);
1487 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
1488 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
1489 }
1490 SaveBMInfo(bm, not_at_start, offset);
1491}
1492
1493intptr_t AssertionNode::EatsAtLeast(intptr_t still_to_find,
1494 intptr_t budget,
1495 bool not_at_start) {
1496 if (budget <= 0) return 0;
1497 // If we know we are not at the start and we are asked "how many characters
1498 // will you match if you succeed?" then we can answer anything since false
1499 // implies false. So lets just return the max answer (still_to_find) since
1500 // that won't prevent us from preloading a lot of characters for the other
1501 // branches in the node graph.
1502 if (assertion_type() == AT_START && not_at_start) return still_to_find;
1503 return on_success()->EatsAtLeast(still_to_find, budget - 1, not_at_start);
1504}
1505
1507 intptr_t budget,
1509 bool not_at_start) {
1510 // Match the behaviour of EatsAtLeast on this node.
1511 if (assertion_type() == AT_START && not_at_start) return;
1512 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
1513 SaveBMInfo(bm, not_at_start, offset);
1514}
1515
1516intptr_t BackReferenceNode::EatsAtLeast(intptr_t still_to_find,
1517 intptr_t budget,
1518 bool not_at_start) {
1519 if (read_backward()) return 0;
1520 if (budget <= 0) return 0;
1521 return on_success()->EatsAtLeast(still_to_find, budget - 1, not_at_start);
1522}
1523
1524intptr_t TextNode::EatsAtLeast(intptr_t still_to_find,
1525 intptr_t budget,
1526 bool not_at_start) {
1527 if (read_backward()) return 0;
1528 intptr_t answer = Length();
1529 if (answer >= still_to_find) return answer;
1530 if (budget <= 0) return answer;
1531 // We are not at start after this node so we set the last argument to 'true'.
1532 return answer +
1533 on_success()->EatsAtLeast(still_to_find - answer, budget - 1, true);
1534}
1535
1536intptr_t NegativeLookaroundChoiceNode::EatsAtLeast(intptr_t still_to_find,
1537 intptr_t budget,
1538 bool not_at_start) {
1539 if (budget <= 0) return 0;
1540 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1541 // afterwards.
1542 RegExpNode* node = (*alternatives_)[1].node();
1543 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
1544}
1545
1547 QuickCheckDetails* details,
1549 intptr_t filled_in,
1550 bool not_at_start) {
1551 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1552 // afterwards.
1553 RegExpNode* node = (*alternatives_)[1].node();
1554 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
1555}
1556
1557intptr_t ChoiceNode::EatsAtLeastHelper(intptr_t still_to_find,
1558 intptr_t budget,
1559 RegExpNode* ignore_this_node,
1560 bool not_at_start) {
1561 if (budget <= 0) return 0;
1562 intptr_t min = 100;
1563 intptr_t choice_count = alternatives_->length();
1564 budget = (budget - 1) / choice_count;
1565 for (intptr_t i = 0; i < choice_count; i++) {
1566 RegExpNode* node = (*alternatives_)[i].node();
1567 if (node == ignore_this_node) continue;
1568 intptr_t node_eats_at_least =
1569 node->EatsAtLeast(still_to_find, budget, not_at_start);
1570 if (node_eats_at_least < min) min = node_eats_at_least;
1571 if (min == 0) return 0;
1572 }
1573 return min;
1574}
1575
1576intptr_t LoopChoiceNode::EatsAtLeast(intptr_t still_to_find,
1577 intptr_t budget,
1578 bool not_at_start) {
1579 return EatsAtLeastHelper(still_to_find, budget - 1, loop_node_, not_at_start);
1580}
1581
1582intptr_t ChoiceNode::EatsAtLeast(intptr_t still_to_find,
1583 intptr_t budget,
1584 bool not_at_start) {
1585 return EatsAtLeastHelper(still_to_find, budget, nullptr, not_at_start);
1586}
1587
1588// Takes the left-most 1-bit and smears it out, setting all bits to its right.
1589static inline uint32_t SmearBitsRight(uint32_t v) {
1590 v |= v >> 1;
1591 v |= v >> 2;
1592 v |= v >> 4;
1593 v |= v >> 8;
1594 v |= v >> 16;
1595 return v;
1596}
1597
1599 bool found_useful_op = false;
1600 uint32_t char_mask;
1601 if (asc) {
1603 } else {
1604 char_mask = Utf16::kMaxCodeUnit;
1605 }
1606 mask_ = 0;
1607 value_ = 0;
1608 intptr_t char_shift = 0;
1609 for (intptr_t i = 0; i < characters_; i++) {
1610 Position* pos = &positions_[i];
1611 if ((pos->mask & Symbols::kMaxOneCharCodeSymbol) != 0) {
1612 found_useful_op = true;
1613 }
1614 mask_ |= (pos->mask & char_mask) << char_shift;
1615 value_ |= (pos->value & char_mask) << char_shift;
1616 char_shift += asc ? 8 : 16;
1617 }
1618 return found_useful_op;
1619}
1620
1622 Trace* bounds_check_trace,
1623 Trace* trace,
1624 bool preload_has_checked_bounds,
1625 BlockLabel* on_possible_success,
1626 QuickCheckDetails* details,
1627 bool fall_through_on_failure) {
1628 if (details->characters() == 0) return false;
1629 GetQuickCheckDetails(details, compiler, 0,
1630 trace->at_start() == Trace::FALSE_VALUE);
1631 if (details->cannot_match()) return false;
1632 if (!details->Rationalize(compiler->one_byte())) return false;
1633 ASSERT(details->characters() == 1 ||
1634 compiler->macro_assembler()->CanReadUnaligned());
1635 uint32_t mask = details->mask();
1636 uint32_t value = details->value();
1637
1638 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1639
1640 if (trace->characters_preloaded() != details->characters()) {
1641 ASSERT(trace->cp_offset() == bounds_check_trace->cp_offset());
1642 // We are attempting to preload the minimum number of characters
1643 // any choice would eat, so if the bounds check fails, then none of the
1644 // choices can succeed, so we can just immediately backtrack, rather
1645 // than go to the next choice.
1646 assembler->LoadCurrentCharacter(
1647 trace->cp_offset(), bounds_check_trace->backtrack(),
1648 !preload_has_checked_bounds, details->characters());
1649 }
1650
1651 bool need_mask = true;
1652
1653 if (details->characters() == 1) {
1654 // If number of characters preloaded is 1 then we used a byte or 16 bit
1655 // load so the value is already masked down.
1656 uint32_t char_mask;
1657 if (compiler->one_byte()) {
1659 } else {
1660 char_mask = Utf16::kMaxCodeUnit;
1661 }
1662 if ((mask & char_mask) == char_mask) need_mask = false;
1663 mask &= char_mask;
1664 } else {
1665 // For 2-character preloads in one-byte mode or 1-character preloads in
1666 // two-byte mode we also use a 16 bit load with zero extend.
1667 if (details->characters() == 2 && compiler->one_byte()) {
1668 if ((mask & 0xffff) == 0xffff) need_mask = false;
1669 } else if (details->characters() == 1 && !compiler->one_byte()) {
1670 if ((mask & 0xffff) == 0xffff) need_mask = false;
1671 } else {
1672 if (mask == 0xffffffff) need_mask = false;
1673 }
1674 }
1675
1676 if (fall_through_on_failure) {
1677 if (need_mask) {
1678 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
1679 } else {
1680 assembler->CheckCharacter(value, on_possible_success);
1681 }
1682 } else {
1683 if (need_mask) {
1684 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
1685 } else {
1686 assembler->CheckNotCharacter(value, trace->backtrack());
1687 }
1688 }
1689 return true;
1690}
1691
1692// Here is the meat of GetQuickCheckDetails (see also the comment on the
1693// super-class in the .h file).
1694//
1695// We iterate along the text object, building up for each character a
1696// mask and value that can be used to test for a quick failure to match.
1697// The masks and values for the positions will be combined into a single
1698// machine word for the current character width in order to be used in
1699// generating a quick check.
1702 intptr_t characters_filled_in,
1703 bool not_at_start) {
1704#if defined(__GNUC__) && defined(__BYTE_ORDER__)
1705 // TODO(zerny): Make the combination code byte-order independent.
1706 ASSERT(details->characters() == 1 ||
1707 (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__));
1708#endif
1709 // Do not collect any quick check details if the text node reads backward,
1710 // since it reads in the opposite direction than we use for quick checks.
1711 if (read_backward()) return;
1712 ASSERT(characters_filled_in < details->characters());
1713 intptr_t characters = details->characters();
1714 int32_t char_mask;
1715 if (compiler->one_byte()) {
1717 } else {
1718 char_mask = Utf16::kMaxCodeUnit;
1719 }
1720 for (intptr_t k = 0; k < elms_->length(); k++) {
1721 TextElement elm = elms_->At(k);
1722 if (elm.text_type() == TextElement::ATOM) {
1723 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
1724 for (intptr_t i = 0; i < characters && i < quarks->length(); i++) {
1726 details->positions(characters_filled_in);
1727 uint16_t c = quarks->At(i);
1728 if (c > char_mask) {
1729 // If we expect a non-Latin1 character from an one-byte string,
1730 // there is no way we can match. Not even case independent
1731 // matching can turn an Latin1 character into non-Latin1 or
1732 // vice versa.
1733 // TODO(dcarney): issue 3550. Verify that this works as expected.
1734 // For example, \u0178 is uppercase of \u00ff (y-umlaut).
1735 details->set_cannot_match();
1736 pos->determines_perfectly = false;
1737 return;
1738 }
1739 if (elm.atom()->ignore_case()) {
1741 intptr_t length =
1742 GetCaseIndependentLetters(c, compiler->one_byte(), chars);
1743 ASSERT(length != 0); // Can only happen if c > char_mask (see above).
1744 if (length == 1) {
1745 // This letter has no case equivalents, so it's nice and simple
1746 // and the mask-compare will determine definitely whether we have
1747 // a match at this character position.
1748 pos->mask = char_mask;
1749 pos->value = c;
1750 pos->determines_perfectly = true;
1751 } else {
1752 uint32_t common_bits = char_mask;
1753 uint32_t bits = chars[0];
1754 for (intptr_t j = 1; j < length; j++) {
1755 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
1756 common_bits ^= differing_bits;
1757 bits &= common_bits;
1758 }
1759 // If length is 2 and common bits has only one zero in it then
1760 // our mask and compare instruction will determine definitely
1761 // whether we have a match at this character position. Otherwise
1762 // it can only be an approximate check.
1763 uint32_t one_zero = (common_bits | ~char_mask);
1764 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
1765 pos->determines_perfectly = true;
1766 }
1767 pos->mask = common_bits;
1768 pos->value = bits;
1769 }
1770 } else {
1771 // Don't ignore case. Nice simple case where the mask-compare will
1772 // determine definitely whether we have a match at this character
1773 // position.
1774 pos->mask = char_mask;
1775 pos->value = c;
1776 pos->determines_perfectly = true;
1777 }
1778 characters_filled_in++;
1779 ASSERT(characters_filled_in <= details->characters());
1780 if (characters_filled_in == details->characters()) {
1781 return;
1782 }
1783 }
1784 } else {
1786 details->positions(characters_filled_in);
1787 RegExpCharacterClass* tree = elm.char_class();
1788 ZoneGrowableArray<CharacterRange>* ranges = tree->ranges();
1789 ASSERT(!ranges->is_empty());
1790 if (!CharacterRange::IsCanonical(ranges)) {
1792 }
1793 if (tree->is_negated()) {
1794 // A quick check uses multi-character mask and compare. There is no
1795 // useful way to incorporate a negative char class into this scheme
1796 // so we just conservatively create a mask and value that will always
1797 // succeed.
1798 pos->mask = 0;
1799 pos->value = 0;
1800 } else {
1801 intptr_t first_range = 0;
1802 while (ranges->At(first_range).from() > char_mask) {
1803 first_range++;
1804 if (first_range == ranges->length()) {
1805 details->set_cannot_match();
1806 pos->determines_perfectly = false;
1807 return;
1808 }
1809 }
1810 CharacterRange range = ranges->At(first_range);
1811 uint16_t from = range.from();
1812 uint16_t to = range.to();
1813 if (to > char_mask) {
1814 to = char_mask;
1815 }
1816 uint32_t differing_bits = (from ^ to);
1817 // A mask and compare is only perfect if the differing bits form a
1818 // number like 00011111 with one single block of trailing 1s.
1819 if ((differing_bits & (differing_bits + 1)) == 0 &&
1820 from + differing_bits == to) {
1821 pos->determines_perfectly = true;
1822 }
1823 uint32_t common_bits = ~SmearBitsRight(differing_bits);
1824 uint32_t bits = (from & common_bits);
1825 for (intptr_t i = first_range + 1; i < ranges->length(); i++) {
1826 CharacterRange range = ranges->At(i);
1827 uint16_t from = range.from();
1828 uint16_t to = range.to();
1829 if (from > char_mask) continue;
1830 if (to > char_mask) to = char_mask;
1831 // Here we are combining more ranges into the mask and compare
1832 // value. With each new range the mask becomes more sparse and
1833 // so the chances of a false positive rise. A character class
1834 // with multiple ranges is assumed never to be equivalent to a
1835 // mask and compare operation.
1836 pos->determines_perfectly = false;
1837 uint32_t new_common_bits = (from ^ to);
1838 new_common_bits = ~SmearBitsRight(new_common_bits);
1839 common_bits &= new_common_bits;
1840 bits &= new_common_bits;
1841 uint32_t differing_bits = (from & common_bits) ^ bits;
1842 common_bits ^= differing_bits;
1843 bits &= common_bits;
1844 }
1845 pos->mask = common_bits;
1846 pos->value = bits;
1847 }
1848 characters_filled_in++;
1849 ASSERT(characters_filled_in <= details->characters());
1850 if (characters_filled_in == details->characters()) {
1851 return;
1852 }
1853 }
1854 }
1855 ASSERT(characters_filled_in != details->characters());
1856 if (!details->cannot_match()) {
1857 on_success()->GetQuickCheckDetails(details, compiler, characters_filled_in,
1858 true);
1859 }
1860}
1861
1863 for (int i = 0; i < characters_; i++) {
1864 positions_[i].mask = 0;
1865 positions_[i].value = 0;
1866 positions_[i].determines_perfectly = false;
1867 }
1868 characters_ = 0;
1869}
1870
1871void QuickCheckDetails::Advance(intptr_t by, bool one_byte) {
1872 if (by >= characters_ || by < 0) {
1873 // check that by < 0 => characters_ == 0
1874 ASSERT(by >= 0 || characters_ == 0);
1875 Clear();
1876 return;
1877 }
1878 for (intptr_t i = 0; i < characters_ - by; i++) {
1879 positions_[i] = positions_[by + i];
1880 }
1881 for (intptr_t i = characters_ - by; i < characters_; i++) {
1882 positions_[i].mask = 0;
1883 positions_[i].value = 0;
1884 positions_[i].determines_perfectly = false;
1885 }
1886 characters_ -= by;
1887 // We could change mask_ and value_ here but we would never advance unless
1888 // they had already been used in a check and they won't be used again because
1889 // it would gain us nothing. So there's no point.
1890}
1891
1892void QuickCheckDetails::Merge(QuickCheckDetails* other, intptr_t from_index) {
1893 ASSERT(characters_ == other->characters_);
1894 if (other->cannot_match_) {
1895 return;
1896 }
1897 if (cannot_match_) {
1898 *this = *other;
1899 return;
1900 }
1901 for (intptr_t i = from_index; i < characters_; i++) {
1902 QuickCheckDetails::Position* pos = positions(i);
1903 QuickCheckDetails::Position* other_pos = other->positions(i);
1904 if (pos->mask != other_pos->mask || pos->value != other_pos->value ||
1905 !other_pos->determines_perfectly) {
1906 // Our mask-compare operation will be approximate unless we have the
1907 // exact same operation on both sides of the alternation.
1908 pos->determines_perfectly = false;
1909 }
1910 pos->mask &= other_pos->mask;
1911 pos->value &= pos->mask;
1912 other_pos->value &= pos->mask;
1913 uint16_t differing_bits = (pos->value ^ other_pos->value);
1914 pos->mask &= ~differing_bits;
1915 pos->value &= pos->mask;
1916 }
1917}
1918
1919class VisitMarker : public ValueObject {
1920 public:
1921 explicit VisitMarker(NodeInfo* info) : info_(info) {
1922 ASSERT(!info->visited);
1923 info->visited = true;
1924 }
1925 ~VisitMarker() { info_->visited = false; }
1926
1927 private:
1928 NodeInfo* info_;
1929};
1930
1932 if (info()->replacement_calculated) return replacement();
1933 if (depth < 0) return this;
1934 ASSERT(!info()->visited);
1936 return FilterSuccessor(depth - 1);
1937}
1938
1940 RegExpNode* next = on_success_->FilterOneByte(depth - 1);
1941 if (next == nullptr) return set_replacement(nullptr);
1942 on_success_ = next;
1943 return set_replacement(this);
1944}
1945
1946// We need to check for the following characters: 0x39c 0x3bc 0x178.
1948 // TODO(dcarney): this could be a lot more efficient.
1949 return range.Contains(0x39c) || range.Contains(0x3bc) ||
1950 range.Contains(0x178);
1951}
1952
1955 for (intptr_t i = 0; i < ranges->length(); i++) {
1956 // TODO(dcarney): this could be a lot more efficient.
1957 if (RangeContainsLatin1Equivalents(ranges->At(i))) return true;
1958 }
1959 return false;
1960}
1961
1962static uint16_t ConvertNonLatin1ToLatin1(uint16_t c) {
1964 switch (c) {
1965 // This are equivalent characters in unicode.
1966 case 0x39c:
1967 case 0x3bc:
1968 return 0xb5;
1969 // This is an uppercase of a Latin-1 character
1970 // outside of Latin-1.
1971 case 0x178:
1972 return 0xff;
1973 }
1974 return 0;
1975}
1976
1978 if (info()->replacement_calculated) return replacement();
1979 if (depth < 0) return this;
1980 ASSERT(!info()->visited);
1982 intptr_t element_count = elms_->length();
1983 for (intptr_t i = 0; i < element_count; i++) {
1984 TextElement elm = elms_->At(i);
1985 if (elm.text_type() == TextElement::ATOM) {
1986 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
1987 for (intptr_t j = 0; j < quarks->length(); j++) {
1988 uint16_t c = quarks->At(j);
1989 if (c <= Symbols::kMaxOneCharCodeSymbol) continue;
1990 if (!elm.atom()->ignore_case()) return set_replacement(nullptr);
1991 // Here, we need to check for characters whose upper and lower cases
1992 // are outside the Latin-1 range.
1993 uint16_t converted = ConvertNonLatin1ToLatin1(c);
1994 // Character is outside Latin-1 completely
1995 if (converted == 0) return set_replacement(nullptr);
1996 // Convert quark to Latin-1 in place.
1997 (*quarks)[0] = converted;
1998 }
1999 } else {
2001 RegExpCharacterClass* cc = elm.char_class();
2002 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();
2003 if (!CharacterRange::IsCanonical(ranges)) {
2005 }
2006 // Now they are in order so we only need to look at the first.
2007 intptr_t range_count = ranges->length();
2008 if (cc->is_negated()) {
2009 if (range_count != 0 && ranges->At(0).from() == 0 &&
2010 ranges->At(0).to() >= Symbols::kMaxOneCharCodeSymbol) {
2011 // This will be handled in a later filter.
2012 if (cc->flags().IgnoreCase() &&
2014 continue;
2015 }
2016 return set_replacement(nullptr);
2017 }
2018 } else {
2019 if (range_count == 0 ||
2020 ranges->At(0).from() > Symbols::kMaxOneCharCodeSymbol) {
2021 // This will be handled in a later filter.
2022 if (cc->flags().IgnoreCase() &&
2024 continue;
2025 return set_replacement(nullptr);
2026 }
2027 }
2028 }
2029 }
2030 return FilterSuccessor(depth - 1);
2031}
2032
2034 if (info()->replacement_calculated) return replacement();
2035 if (depth < 0) return this;
2036 if (info()->visited) return this;
2037 {
2039
2040 RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1);
2041 // If we can't continue after the loop then there is no sense in doing the
2042 // loop.
2043 if (continue_replacement == nullptr) return set_replacement(nullptr);
2044 }
2045
2046 return ChoiceNode::FilterOneByte(depth - 1);
2047}
2048
2050 if (info()->replacement_calculated) return replacement();
2051 if (depth < 0) return this;
2052 if (info()->visited) return this;
2054 intptr_t choice_count = alternatives_->length();
2055
2056 for (intptr_t i = 0; i < choice_count; i++) {
2057 GuardedAlternative alternative = alternatives_->At(i);
2058 if (alternative.guards() != nullptr &&
2059 alternative.guards()->length() != 0) {
2060 set_replacement(this);
2061 return this;
2062 }
2063 }
2064
2065 intptr_t surviving = 0;
2066 RegExpNode* survivor = nullptr;
2067 for (intptr_t i = 0; i < choice_count; i++) {
2068 GuardedAlternative alternative = alternatives_->At(i);
2069 RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1);
2070 ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.
2071 if (replacement != nullptr) {
2072 (*alternatives_)[i].set_node(replacement);
2073 surviving++;
2074 survivor = replacement;
2075 }
2076 }
2077 if (surviving < 2) return set_replacement(survivor);
2078
2079 set_replacement(this);
2080 if (surviving == choice_count) {
2081 return this;
2082 }
2083 // Only some of the nodes survived the filtering. We need to rebuild the
2084 // alternatives list.
2085 ZoneGrowableArray<GuardedAlternative>* new_alternatives =
2087 for (intptr_t i = 0; i < choice_count; i++) {
2088 RegExpNode* replacement =
2089 (*alternatives_)[i].node()->FilterOneByte(depth - 1);
2090 if (replacement != nullptr) {
2091 (*alternatives_)[i].set_node(replacement);
2092 new_alternatives->Add((*alternatives_)[i]);
2093 }
2094 }
2095 alternatives_ = new_alternatives;
2096 return this;
2097}
2098
2100 if (info()->replacement_calculated) return replacement();
2101 if (depth < 0) return this;
2102 if (info()->visited) return this;
2104 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2105 // afterwards.
2106 RegExpNode* node = (*alternatives_)[1].node();
2107 RegExpNode* replacement = node->FilterOneByte(depth - 1);
2108 if (replacement == nullptr) return set_replacement(nullptr);
2109 (*alternatives_)[1].set_node(replacement);
2110
2111 RegExpNode* neg_node = (*alternatives_)[0].node();
2112 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1);
2113 // If the negative lookahead is always going to fail then
2114 // we don't need to check it.
2115 if (neg_replacement == nullptr) return set_replacement(replacement);
2116 (*alternatives_)[0].set_node(neg_replacement);
2117 return set_replacement(this);
2118}
2119
2122 intptr_t characters_filled_in,
2123 bool not_at_start) {
2124 if (body_can_be_zero_length_ || info()->visited) return;
2127 characters_filled_in, not_at_start);
2128}
2129
2131 intptr_t budget,
2133 bool not_at_start) {
2134 if (body_can_be_zero_length_ || budget <= 0) {
2135 bm->SetRest(offset);
2136 SaveBMInfo(bm, not_at_start, offset);
2137 return;
2138 }
2139 ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start);
2140 SaveBMInfo(bm, not_at_start, offset);
2141}
2142
2145 intptr_t characters_filled_in,
2146 bool not_at_start) {
2147 not_at_start = (not_at_start || not_at_start_);
2148 intptr_t choice_count = alternatives_->length();
2149 ASSERT(choice_count > 0);
2150 (*alternatives_)[0].node()->GetQuickCheckDetails(
2151 details, compiler, characters_filled_in, not_at_start);
2152 for (intptr_t i = 1; i < choice_count; i++) {
2153 QuickCheckDetails new_details(details->characters());
2154 RegExpNode* node = (*alternatives_)[i].node();
2155 node->GetQuickCheckDetails(&new_details, compiler, characters_filled_in,
2156 not_at_start);
2157 // Here we merge the quick match details of the two branches.
2158 details->Merge(&new_details, characters_filled_in);
2159 }
2160}
2161
2162// Check for [0-9A-Z_a-z].
2165 BlockLabel* non_word,
2166 bool fall_through_on_word) {
2167 if (assembler->CheckSpecialCharacterClass(
2168 fall_through_on_word ? 'w' : 'W',
2169 fall_through_on_word ? non_word : word)) {
2170 // Optimized implementation available.
2171 return;
2172 }
2173 assembler->CheckCharacterGT('z', non_word);
2174 assembler->CheckCharacterLT('0', non_word);
2175 assembler->CheckCharacterGT('a' - 1, word);
2176 assembler->CheckCharacterLT('9' + 1, word);
2177 assembler->CheckCharacterLT('A', non_word);
2178 assembler->CheckCharacterLT('Z' + 1, word);
2179 if (fall_through_on_word) {
2180 assembler->CheckNotCharacter('_', non_word);
2181 } else {
2182 assembler->CheckCharacter('_', word);
2183 }
2184}
2185
2186// Emit the code to check for a ^ in multiline mode (1-character lookbehind
2187// that matches newline or the start of input).
2189 RegExpNode* on_success,
2190 Trace* trace) {
2191 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2192 // We will be loading the previous character into the current character
2193 // register.
2194 Trace new_trace(*trace);
2195 new_trace.InvalidateCurrentCharacter();
2196
2197 BlockLabel ok;
2198 if (new_trace.cp_offset() == 0) {
2199 // The start of input counts as a newline in this context, so skip to
2200 // ok if we are at the start.
2201 assembler->CheckAtStart(&ok);
2202 }
2203 // We already checked that we are not at the start of input so it must be
2204 // OK to load the previous character.
2205 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2206 new_trace.backtrack(), false);
2207 if (!assembler->CheckSpecialCharacterClass('n', new_trace.backtrack())) {
2208 // Newline means \n, \r, 0x2028 or 0x2029.
2209 if (!compiler->one_byte()) {
2210 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
2211 }
2212 assembler->CheckCharacter('\n', &ok);
2213 assembler->CheckNotCharacter('\r', new_trace.backtrack());
2214 }
2215 assembler->BindBlock(&ok);
2216 on_success->Emit(compiler, &new_trace);
2217}
2218
2219// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2220void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
2221 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2222 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2223 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
2224 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2225 if (lookahead == nullptr) {
2226 intptr_t eats_at_least =
2228 EatsAtLeast(kMaxLookaheadForBoyerMoore, kRecursionBudget,
2229 not_at_start));
2230 if (eats_at_least >= 1) {
2231 BoyerMooreLookahead* bm =
2232 new (Z) BoyerMooreLookahead(eats_at_least, compiler, Z);
2233 FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
2234 if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE_VALUE;
2235 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2236 }
2237 } else {
2238 if (lookahead->at(0)->is_non_word())
2239 next_is_word_character = Trace::FALSE_VALUE;
2240 if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2241 }
2242 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
2243 if (next_is_word_character == Trace::UNKNOWN) {
2244 BlockLabel before_non_word;
2245 BlockLabel before_word;
2246 if (trace->characters_preloaded() != 1) {
2247 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2248 }
2249 // Fall through on non-word.
2250 EmitWordCheck(assembler, &before_word, &before_non_word, false);
2251 // Next character is not a word character.
2252 assembler->BindBlock(&before_non_word);
2253 BlockLabel ok;
2254 // Backtrack on \B (non-boundary check) if previous is a word,
2255 // since we know next *is not* a word and this would be a boundary.
2256 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2257
2258 if (!assembler->IsClosed()) {
2259 assembler->GoTo(&ok);
2260 }
2261
2262 assembler->BindBlock(&before_word);
2263 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2264 assembler->BindBlock(&ok);
2265 } else if (next_is_word_character == Trace::TRUE_VALUE) {
2266 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2267 } else {
2268 ASSERT(next_is_word_character == Trace::FALSE_VALUE);
2269 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2270 }
2271}
2272
2273void AssertionNode::BacktrackIfPrevious(
2274 RegExpCompiler* compiler,
2275 Trace* trace,
2276 AssertionNode::IfPrevious backtrack_if_previous) {
2277 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2278 Trace new_trace(*trace);
2279 new_trace.InvalidateCurrentCharacter();
2280
2281 BlockLabel fall_through, dummy;
2282
2283 BlockLabel* non_word = backtrack_if_previous == kIsNonWord
2284 ? new_trace.backtrack()
2285 : &fall_through;
2286 BlockLabel* word = backtrack_if_previous == kIsNonWord
2287 ? &fall_through
2288 : new_trace.backtrack();
2289
2290 if (new_trace.cp_offset() == 0) {
2291 // The start of input counts as a non-word character, so the question is
2292 // decided if we are at the start.
2293 assembler->CheckAtStart(non_word);
2294 }
2295 // We already checked that we are not at the start of input so it must be
2296 // OK to load the previous character.
2297 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
2298 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
2299
2300 assembler->BindBlock(&fall_through);
2301 on_success()->Emit(compiler, &new_trace);
2302}
2303
2306 intptr_t filled_in,
2307 bool not_at_start) {
2308 if (assertion_type_ == AT_START && not_at_start) {
2309 details->set_cannot_match();
2310 return;
2311 }
2312 return on_success()->GetQuickCheckDetails(details, compiler, filled_in,
2313 not_at_start);
2314}
2315
2317 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2318 switch (assertion_type_) {
2319 case AT_END: {
2320 BlockLabel ok;
2321 assembler->CheckPosition(trace->cp_offset(), &ok);
2322 assembler->GoTo(trace->backtrack());
2323 assembler->BindBlock(&ok);
2324 break;
2325 }
2326 case AT_START: {
2327 if (trace->at_start() == Trace::FALSE_VALUE) {
2328 assembler->GoTo(trace->backtrack());
2329 return;
2330 }
2331 if (trace->at_start() == Trace::UNKNOWN) {
2332 assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
2333 Trace at_start_trace = *trace;
2334 at_start_trace.set_at_start(Trace::TRUE_VALUE);
2335 on_success()->Emit(compiler, &at_start_trace);
2336 return;
2337 }
2338 } break;
2339 case AFTER_NEWLINE:
2340 EmitHat(compiler, on_success(), trace);
2341 return;
2342 case AT_BOUNDARY:
2343 case AT_NON_BOUNDARY: {
2344 EmitBoundaryCheck(compiler, trace);
2345 return;
2346 }
2347 }
2348 on_success()->Emit(compiler, trace);
2349}
2350
2351static bool DeterminedAlready(QuickCheckDetails* quick_check, intptr_t offset) {
2352 if (quick_check == nullptr) return false;
2353 if (offset >= quick_check->characters()) return false;
2354 return quick_check->positions(offset)->determines_perfectly;
2355}
2356
2357static void UpdateBoundsCheck(intptr_t index, intptr_t* checked_up_to) {
2358 if (index > *checked_up_to) {
2359 *checked_up_to = index;
2360 }
2361}
2362
2363// We call this repeatedly to generate code for each pass over the text node.
2364// The passes are in increasing order of difficulty because we hope one
2365// of the first passes will fail in which case we are saved the work of the
2366// later passes. for example for the case independent regexp /%[asdfghjkl]a/
2367// we will check the '%' in the first pass, the case independent 'a' in the
2368// second pass and the character class in the last pass.
2369//
2370// The passes are done from right to left, so for example to test for /bar/
2371// we will first test for an 'r' with offset 2, then an 'a' with offset 1
2372// and then a 'b' with offset 0. This means we can avoid the end-of-input
2373// bounds check most of the time. In the example we only need to check for
2374// end-of-input when loading the putative 'r'.
2375//
2376// A slight complication involves the fact that the first character may already
2377// be fetched into a register by the previous node. In this case we want to
2378// do the test for that character first. We do this in separate passes. The
2379// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
2380// pass has been performed then subsequent passes will have true in
2381// first_element_checked to indicate that character does not need to be
2382// checked again.
2383//
2384// In addition to all this we are passed a Trace, which can
2385// contain an AlternativeGeneration object. In this AlternativeGeneration
2386// object we can see details of any quick check that was already passed in
2387// order to get to the code we are now generating. The quick check can involve
2388// loading characters, which means we do not need to recheck the bounds
2389// up to the limit the quick check already checked. In addition the quick
2390// check can have involved a mask and compare operation which may simplify
2391// or obviate the need for further checks at some character positions.
2392void TextNode::TextEmitPass(RegExpCompiler* compiler,
2393 TextEmitPassType pass,
2394 bool preloaded,
2395 Trace* trace,
2396 bool first_element_checked,
2397 intptr_t* checked_up_to) {
2398 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2399 bool one_byte = compiler->one_byte();
2400 BlockLabel* backtrack = trace->backtrack();
2401 QuickCheckDetails* quick_check = trace->quick_check_performed();
2402 intptr_t element_count = elms_->length();
2403 intptr_t backward_offset = read_backward() ? -Length() : 0;
2404 for (intptr_t i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2405 TextElement elm = elms_->At(i);
2406 intptr_t cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
2407 if (elm.text_type() == TextElement::ATOM) {
2408 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
2409 for (intptr_t j = preloaded ? 0 : quarks->length() - 1; j >= 0; j--) {
2410 if (SkipPass(pass, elm.atom()->ignore_case())) continue;
2411 if (first_element_checked && i == 0 && j == 0) continue;
2412 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
2413 EmitCharacterFunction* emit_function = nullptr;
2414 uint16_t quark = quarks->At(j);
2415 if (elm.atom()->ignore_case()) {
2416 // Everywhere else we assume that a non-Latin-1 character cannot match
2417 // a Latin-1 character. Avoid the cases where this is assumption is
2418 // invalid by using the Latin1 equivalent instead.
2419 quark = Latin1::TryConvertToLatin1(quark);
2420 }
2421 switch (pass) {
2422 case NON_LATIN1_MATCH:
2423 ASSERT(one_byte);
2424 if (quark > Symbols::kMaxOneCharCodeSymbol) {
2425 assembler->GoTo(backtrack);
2426 return;
2427 }
2428 break;
2429 case NON_LETTER_CHARACTER_MATCH:
2430 emit_function = &EmitAtomNonLetter;
2431 break;
2432 case SIMPLE_CHARACTER_MATCH:
2433 emit_function = &EmitSimpleCharacter;
2434 break;
2435 case CASE_CHARACTER_MATCH:
2436 emit_function = &EmitAtomLetter;
2437 break;
2438 default:
2439 break;
2440 }
2441 if (emit_function != nullptr) {
2442 const bool bounds_check =
2443 *checked_up_to < (cp_offset + j) || read_backward();
2444 bool bound_checked =
2445 emit_function(Z, compiler, quarks->At(j), backtrack,
2446 cp_offset + j, bounds_check, preloaded);
2447 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
2448 }
2449 }
2450 } else {
2451 ASSERT(elm.text_type() == TextElement::CHAR_CLASS);
2452 if (pass == CHARACTER_CLASS_MATCH) {
2453 if (first_element_checked && i == 0) continue;
2454 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
2455 RegExpCharacterClass* cc = elm.char_class();
2456 bool bounds_check = *checked_up_to < cp_offset || read_backward();
2457 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
2458 bounds_check, preloaded, Z);
2459 UpdateBoundsCheck(cp_offset, checked_up_to);
2460 }
2461 }
2462 }
2463}
2464
2465intptr_t TextNode::Length() {
2466 TextElement elm = elms_->Last();
2467 ASSERT(elm.cp_offset() >= 0);
2468 return elm.cp_offset() + elm.length();
2469}
2470
2471bool TextNode::SkipPass(intptr_t intptr_t_pass, bool ignore_case) {
2472 TextEmitPassType pass = static_cast<TextEmitPassType>(intptr_t_pass);
2473 if (ignore_case) {
2474 return pass == SIMPLE_CHARACTER_MATCH;
2475 } else {
2476 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
2477 }
2478}
2479
2482 bool read_backward,
2483 RegExpNode* on_success,
2485 ASSERT(ranges != nullptr);
2488 return new TextNode(elms, read_backward, on_success);
2489}
2490
2492 CharacterRange trail,
2493 bool read_backward,
2494 RegExpNode* on_success,
2496 auto lead_ranges = CharacterRange::List(on_success->zone(), lead);
2497 auto trail_ranges = CharacterRange::List(on_success->zone(), trail);
2498 auto elms = new ZoneGrowableArray<TextElement>(2);
2499
2500 elms->Add(
2502 elms->Add(
2504
2505 return new TextNode(elms, read_backward, on_success);
2506}
2507
2508// This generates the code to match a text node. A text node can contain
2509// straight character sequences (possibly to be matched in a case-independent
2510// way) and character classes. For efficiency we do not do this in a single
2511// pass from left to right. Instead we pass over the text node several times,
2512// emitting code for some character positions every time. See the comment on
2513// TextEmitPass for details.
2515 LimitResult limit_result = LimitVersions(compiler, trace);
2516 if (limit_result == DONE) return;
2517 ASSERT(limit_result == CONTINUE);
2518
2519 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2520 compiler->SetRegExpTooBig();
2521 return;
2522 }
2523
2524 if (compiler->one_byte()) {
2525 intptr_t dummy = 0;
2526 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
2527 }
2528
2529 bool first_elt_done = false;
2530 intptr_t bound_checked_to = trace->cp_offset() - 1;
2531 bound_checked_to += trace->bound_checked_up_to();
2532
2533 // If a character is preloaded into the current character register then
2534 // check that now.
2535 if (trace->characters_preloaded() == 1) {
2536 for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) {
2537 TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), true, trace,
2538 false, &bound_checked_to);
2539 }
2540 first_elt_done = true;
2541 }
2542
2543 for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) {
2544 TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), false, trace,
2545 first_elt_done, &bound_checked_to);
2546 }
2547
2548 Trace successor_trace(*trace);
2549 // If we advance backward, we may end up at the start.
2550 successor_trace.AdvanceCurrentPositionInTrace(
2551 read_backward() ? -Length() : Length(), compiler);
2552 successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
2555 on_success()->Emit(compiler, &successor_trace);
2556}
2557
2559 characters_preloaded_ = 0;
2560}
2561
2564 // We don't have an instruction for shifting the current character register
2565 // down or for using a shifted value for anything so lets just forget that
2566 // we preloaded any characters into it.
2567 characters_preloaded_ = 0;
2568 // Adjust the offsets of the quick check performed information. This
2569 // information is used to find out what we already determined about the
2570 // characters by means of mask and compare.
2571 quick_check_performed_.Advance(by, compiler->one_byte());
2572 cp_offset_ += by;
2573 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2574 compiler->SetRegExpTooBig();
2575 cp_offset_ = 0;
2576 }
2577 bound_checked_up_to_ =
2578 Utils::Maximum(static_cast<intptr_t>(0), bound_checked_up_to_ - by);
2579}
2580
2581void TextNode::MakeCaseIndependent(bool is_one_byte) {
2582 intptr_t element_count = elms_->length();
2583 for (intptr_t i = 0; i < element_count; i++) {
2584 TextElement elm = elms_->At(i);
2585 if (elm.text_type() == TextElement::CHAR_CLASS) {
2586 RegExpCharacterClass* cc = elm.char_class();
2587 bool case_equivalents_already_added =
2588 cc->flags().NeedsUnicodeCaseEquivalents();
2589 if (cc->flags().IgnoreCase() && !case_equivalents_already_added) {
2590 // None of the standard character classes is different in the case
2591 // independent case and it slows us down if we don't know that.
2592 if (cc->is_standard()) continue;
2593 CharacterRange::AddCaseEquivalents(cc->ranges(), is_one_byte, Z);
2594 }
2595 }
2596 }
2597}
2598
2600 TextElement elm = elms_->At(elms_->length() - 1);
2601 return elm.cp_offset() + elm.length();
2602}
2603
2606 if (read_backward()) return nullptr;
2607 if (elms_->length() != 1) return nullptr;
2608 TextElement elm = elms_->At(0);
2609 if (elm.text_type() != TextElement::CHAR_CLASS) return nullptr;
2610 RegExpCharacterClass* node = elm.char_class();
2611 ZoneGrowableArray<CharacterRange>* ranges = node->ranges();
2612 if (!CharacterRange::IsCanonical(ranges)) {
2614 }
2615 if (node->is_negated()) {
2616 return ranges->length() == 0 ? on_success() : nullptr;
2617 }
2618 if (ranges->length() != 1) return nullptr;
2619 uint32_t max_char;
2620 if (compiler->one_byte()) {
2622 } else {
2623 max_char = Utf16::kMaxCodeUnit;
2624 }
2625 return ranges->At(0).IsEverything(max_char) ? on_success() : nullptr;
2626}
2627
2628// Finds the fixed match length of a sequence of nodes that goes from
2629// this alternative and back to this choice node. If there are variable
2630// length nodes or other complications in the way then return a sentinel
2631// value indicating that a greedy loop cannot be constructed.
2633 const GuardedAlternative* alternative) {
2634 intptr_t length = 0;
2635 RegExpNode* node = alternative->node();
2636 // Later we will generate code for all these text nodes using recursion
2637 // so we have to limit the max number.
2638 intptr_t recursion_depth = 0;
2639 while (node != this) {
2640 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2641 return kNodeIsTooComplexForGreedyLoops;
2642 }
2643 intptr_t node_length = node->GreedyLoopTextLength();
2644 if (node_length == kNodeIsTooComplexForGreedyLoops) {
2645 return kNodeIsTooComplexForGreedyLoops;
2646 }
2647 length += node_length;
2648 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2649 node = seq_node->on_success();
2650 }
2651 return read_backward() ? -length : length;
2652}
2653
2655 ASSERT(loop_node_ == nullptr);
2656 AddAlternative(alt);
2657 loop_node_ = alt.node();
2658}
2659
2661 ASSERT(continue_node_ == nullptr);
2662 AddAlternative(alt);
2663 continue_node_ = alt.node();
2664}
2665
2667 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2668 if (trace->stop_node() == this) {
2669 // Back edge of greedy optimized loop node graph.
2670 intptr_t text_length =
2671 GreedyLoopTextLengthForAlternative(&alternatives_->At(0));
2672 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
2673 // Update the counter-based backtracking info on the stack. This is an
2674 // optimization for greedy loops (see below).
2675 ASSERT(trace->cp_offset() == text_length);
2676 macro_assembler->AdvanceCurrentPosition(text_length);
2677 macro_assembler->GoTo(trace->loop_label());
2678 return;
2679 }
2680 ASSERT(trace->stop_node() == nullptr);
2681 if (!trace->is_trivial()) {
2682 trace->Flush(compiler, this);
2683 return;
2684 }
2685 ChoiceNode::Emit(compiler, trace);
2686}
2687
2688intptr_t ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
2689 intptr_t eats_at_least) {
2690 intptr_t preload_characters =
2691 Utils::Minimum(static_cast<intptr_t>(4), eats_at_least);
2692 if (compiler->one_byte()) {
2693#if !defined(DART_COMPRESSED_POINTERS) && !defined(TARGET_ARCH_RISCV32)
2694 if (preload_characters > 4) preload_characters = 4;
2695 // We can't preload 3 characters because there is no machine instruction
2696 // to do that. We can't just load 4 because we could be reading
2697 // beyond the end of the string, which could cause a memory fault.
2698 if (preload_characters == 3) preload_characters = 2;
2699#else
2700 // Ensure LoadCodeUnitsInstr can always produce a Smi. See
2701 // https://github.com/dart-lang/sdk/issues/29951
2702 if (preload_characters > 2) preload_characters = 2;
2703#endif
2704 } else {
2705#if !defined(DART_COMPRESSED_POINTERS) && !defined(TARGET_ARCH_RISCV32)
2706 if (preload_characters > 2) preload_characters = 2;
2707#else
2708 // Ensure LoadCodeUnitsInstr can always produce a Smi. See
2709 // https://github.com/dart-lang/sdk/issues/29951
2710 if (preload_characters > 1) preload_characters = 1;
2711#endif
2712 }
2713 if (!compiler->macro_assembler()->CanReadUnaligned()) {
2714 if (preload_characters > 1) preload_characters = 1;
2715 }
2716 return preload_characters;
2717}
2718
2719// This structure is used when generating the alternatives in a choice node. It
2720// records the way the alternative is being code generated.
2723 : possible_success(),
2724 expects_preload(false),
2725 after(),
2726 quick_check_details() {}
2731};
2732
2733// Creates a list of AlternativeGenerations. If the list has a reasonable
2734// size then it is on the stack, otherwise the excess is on the heap.
2736 public:
2737 explicit AlternativeGenerationList(intptr_t count) : count_(count) {
2738 ASSERT(count >= 0);
2739 if (count > kAFew) {
2740 excess_alt_gens_.reset(new AlternativeGeneration[count - kAFew]);
2741 }
2742 }
2743
2745 ASSERT(0 <= i);
2746 ASSERT(i < count_);
2747 if (i < kAFew) {
2748 return &a_few_alt_gens_[i];
2749 }
2750 return &excess_alt_gens_[i - kAFew];
2751 }
2752
2753 private:
2754 static constexpr intptr_t kAFew = 10;
2755
2756 intptr_t count_;
2757 AlternativeGeneration a_few_alt_gens_[kAFew];
2758 std::unique_ptr<AlternativeGeneration[]> excess_alt_gens_;
2759
2762};
2763
2764static constexpr int32_t kRangeEndMarker = Utf::kMaxCodePoint + 1;
2765
2766// The '2' variant is inclusive from and exclusive to.
2767// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
2768// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
2769// 0x180E has been removed from Unicode's Zs category and thus
2770// from ECMAScript's WhiteSpace category as of Unicode 6.3.
2771static constexpr int32_t kSpaceRanges[] = {
2772 '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680,
2773 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
2774 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
2775static constexpr intptr_t kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);
2776static constexpr int32_t kWordRanges[] = {
2777 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
2778static constexpr intptr_t kWordRangeCount = ARRAY_SIZE(kWordRanges);
2779static constexpr int32_t kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
2780static constexpr intptr_t kDigitRangeCount = ARRAY_SIZE(kDigitRanges);
2781static constexpr int32_t kSurrogateRanges[] = {0xd800, 0xe000, kRangeEndMarker};
2783static constexpr int32_t kLineTerminatorRanges[] = {
2784 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
2785static constexpr intptr_t kLineTerminatorRangeCount =
2787
2789 SetInterval(Interval(character, character));
2790}
2791
2793 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
2794 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
2795 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
2796 surrogate_ =
2797 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
2798 if (interval.to() - interval.from() >= kMapSize - 1) {
2799 if (map_count_ != kMapSize) {
2800 map_count_ = kMapSize;
2801 for (intptr_t i = 0; i < kMapSize; i++)
2802 (*map_)[i] = true;
2803 }
2804 return;
2805 }
2806 for (intptr_t i = interval.from(); i <= interval.to(); i++) {
2807 intptr_t mod_character = (i & kMask);
2808 if (!map_->At(mod_character)) {
2809 map_count_++;
2810 (*map_)[mod_character] = true;
2811 }
2812 if (map_count_ == kMapSize) return;
2813 }
2814}
2815
2817 s_ = w_ = d_ = kLatticeUnknown;
2818 if (map_count_ != kMapSize) {
2819 map_count_ = kMapSize;
2820 for (intptr_t i = 0; i < kMapSize; i++)
2821 (*map_)[i] = true;
2822 }
2823}
2824
2827 Zone* zone)
2828 : length_(length), compiler_(compiler) {
2829 if (compiler->one_byte()) {
2831 } else {
2832 max_char_ = Utf16::kMaxCodeUnit;
2833 }
2835 for (intptr_t i = 0; i < length; i++) {
2836 bitmaps_->Add(new (zone) BoyerMoorePositionInfo(zone));
2837 }
2838}
2839
2840// Find the longest range of lookahead that has the fewest number of different
2841// characters that can occur at a given position. Since we are optimizing two
2842// different parameters at once this is a tradeoff.
2843bool BoyerMooreLookahead::FindWorthwhileInterval(intptr_t* from, intptr_t* to) {
2844 intptr_t biggest_points = 0;
2845 // If more than 32 characters out of 128 can occur it is unlikely that we can
2846 // be lucky enough to step forwards much of the time.
2847 const intptr_t kMaxMax = 32;
2848 for (intptr_t max_number_of_chars = 4; max_number_of_chars < kMaxMax;
2849 max_number_of_chars *= 2) {
2850 biggest_points =
2851 FindBestInterval(max_number_of_chars, biggest_points, from, to);
2852 }
2853 if (biggest_points == 0) return false;
2854 return true;
2855}
2856
2857// Find the highest-points range between 0 and length_ where the character
2858// information is not too vague. 'Too vague' means that there are more than
2859// max_number_of_chars that can occur at this position. Calculates the number
2860// of points as the product of width-of-the-range and
2861// probability-of-finding-one-of-the-characters, where the probability is
2862// calculated using the frequency distribution of the sample subject string.
2863intptr_t BoyerMooreLookahead::FindBestInterval(intptr_t max_number_of_chars,
2864 intptr_t old_biggest_points,
2865 intptr_t* from,
2866 intptr_t* to) {
2867 intptr_t biggest_points = old_biggest_points;
2868 static constexpr intptr_t kSize = RegExpMacroAssembler::kTableSize;
2869 for (intptr_t i = 0; i < length_;) {
2870 while (i < length_ && Count(i) > max_number_of_chars)
2871 i++;
2872 if (i == length_) break;
2873 intptr_t remembered_from = i;
2874 bool union_map[kSize];
2875 for (intptr_t j = 0; j < kSize; j++)
2876 union_map[j] = false;
2877 while (i < length_ && Count(i) <= max_number_of_chars) {
2878 BoyerMoorePositionInfo* map = bitmaps_->At(i);
2879 for (intptr_t j = 0; j < kSize; j++)
2880 union_map[j] |= map->at(j);
2881 i++;
2882 }
2883 intptr_t frequency = 0;
2884 for (intptr_t j = 0; j < kSize; j++) {
2885 if (union_map[j]) {
2886 // Add 1 to the frequency to give a small per-character boost for
2887 // the cases where our sampling is not good enough and many
2888 // characters have a frequency of zero. This means the frequency
2889 // can theoretically be up to 2*kSize though we treat it mostly as
2890 // a fraction of kSize.
2891 frequency += compiler_->frequency_collator()->Frequency(j) + 1;
2892 }
2893 }
2894 // We use the probability of skipping times the distance we are skipping to
2895 // judge the effectiveness of this. Actually we have a cut-off: By
2896 // dividing by 2 we switch off the skipping if the probability of skipping
2897 // is less than 50%. This is because the multibyte mask-and-compare
2898 // skipping in quickcheck is more likely to do well on this case.
2899 bool in_quickcheck_range =
2900 ((i - remembered_from < 4) ||
2901 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
2902 // Called 'probability' but it is only a rough estimate and can actually
2903 // be outside the 0-kSize range.
2904 intptr_t probability =
2905 (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
2906 intptr_t points = (i - remembered_from) * probability;
2907 if (points > biggest_points) {
2908 *from = remembered_from;
2909 *to = i - 1;
2910 biggest_points = points;
2911 }
2912 }
2913 return biggest_points;
2914}
2915
2916// Take all the characters that will not prevent a successful match if they
2917// occur in the subject string in the range between min_lookahead and
2918// max_lookahead (inclusive) measured from the current position. If the
2919// character at max_lookahead offset is not one of these characters, then we
2920// can safely skip forwards by the number of characters in the range.
2921intptr_t BoyerMooreLookahead::GetSkipTable(
2922 intptr_t min_lookahead,
2923 intptr_t max_lookahead,
2924 const TypedData& boolean_skip_table) {
2925 const intptr_t kSize = RegExpMacroAssembler::kTableSize;
2926
2927 const intptr_t kSkipArrayEntry = 0;
2928 const intptr_t kDontSkipArrayEntry = 1;
2929
2930 for (intptr_t i = 0; i < kSize; i++) {
2931 boolean_skip_table.SetUint8(i, kSkipArrayEntry);
2932 }
2933 intptr_t skip = max_lookahead + 1 - min_lookahead;
2934
2935 for (intptr_t i = max_lookahead; i >= min_lookahead; i--) {
2936 BoyerMoorePositionInfo* map = bitmaps_->At(i);
2937 for (intptr_t j = 0; j < kSize; j++) {
2938 if (map->at(j)) {
2939 boolean_skip_table.SetUint8(j, kDontSkipArrayEntry);
2940 }
2941 }
2942 }
2943
2944 return skip;
2945}
2946
2947// See comment above on the implementation of GetSkipTable.
2949 const intptr_t kSize = RegExpMacroAssembler::kTableSize;
2950
2951 intptr_t min_lookahead = 0;
2952 intptr_t max_lookahead = 0;
2953
2954 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
2955
2956 bool found_single_character = false;
2957 intptr_t single_character = 0;
2958 for (intptr_t i = max_lookahead; i >= min_lookahead; i--) {
2959 BoyerMoorePositionInfo* map = bitmaps_->At(i);
2960 if (map->map_count() > 1 ||
2961 (found_single_character && map->map_count() != 0)) {
2962 found_single_character = false;
2963 break;
2964 }
2965 for (intptr_t j = 0; j < kSize; j++) {
2966 if (map->at(j)) {
2967 found_single_character = true;
2968 single_character = j;
2969 break;
2970 }
2971 }
2972 }
2973
2974 intptr_t lookahead_width = max_lookahead + 1 - min_lookahead;
2975
2976 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
2977 // The mask-compare can probably handle this better.
2978 return;
2979 }
2980
2981 if (found_single_character) {
2982 BlockLabel cont, again;
2983 masm->BindBlock(&again);
2984 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
2985 if (max_char_ > kSize) {
2986 masm->CheckCharacterAfterAnd(single_character,
2988 } else {
2989 masm->CheckCharacter(single_character, &cont);
2990 }
2991 masm->AdvanceCurrentPosition(lookahead_width);
2992 masm->GoTo(&again);
2993 masm->BindBlock(&cont);
2994 return;
2995 }
2996
2997 const TypedData& boolean_skip_table = TypedData::ZoneHandle(
2998 compiler_->zone(),
2999 TypedData::New(kTypedDataUint8ArrayCid, kSize, Heap::kOld));
3000 intptr_t skip_distance =
3001 GetSkipTable(min_lookahead, max_lookahead, boolean_skip_table);
3002 ASSERT(skip_distance != 0);
3003
3004 BlockLabel cont, again;
3005
3006 masm->BindBlock(&again);
3007 masm->CheckPreemption(/*is_backtrack=*/false);
3008 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3009 masm->CheckBitInTable(boolean_skip_table, &cont);
3010 masm->AdvanceCurrentPosition(skip_distance);
3011 masm->GoTo(&again);
3012 masm->BindBlock(&cont);
3013
3014 return;
3015}
3016
3017/* Code generation for choice nodes.
3018 *
3019 * We generate quick checks that do a mask and compare to eliminate a
3020 * choice. If the quick check succeeds then it jumps to the continuation to
3021 * do slow checks and check subsequent nodes. If it fails (the common case)
3022 * it falls through to the next choice.
3023 *
3024 * Here is the desired flow graph. Nodes directly below each other imply
3025 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3026 * 3 doesn't have a quick check so we have to call the slow check.
3027 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3028 * regexp continuation is generated directly after the Sn node, up to the
3029 * next GoTo if we decide to reuse some already generated code. Some
3030 * nodes expect preload_characters to be preloaded into the current
3031 * character register. R nodes do this preloading. Vertices are marked
3032 * F for failures and S for success (possible success in the case of quick
3033 * nodes). L, V, < and > are used as arrow heads.
3034 *
3035 * ----------> R
3036 * |
3037 * V
3038 * Q1 -----> S1
3039 * | S /
3040 * F| /
3041 * | F/
3042 * | /
3043 * | R
3044 * | /
3045 * V L
3046 * Q2 -----> S2
3047 * | S /
3048 * F| /
3049 * | F/
3050 * | /
3051 * | R
3052 * | /
3053 * V L
3054 * S3
3055 * |
3056 * F|
3057 * |
3058 * R
3059 * |
3060 * backtrack V
3061 * <----------Q4
3062 * \ F |
3063 * \ |S
3064 * \ F V
3065 * \-----S4
3066 *
3067 * For greedy loops we push the current position, then generate the code that
3068 * eats the input specially in EmitGreedyLoop. The other choice (the
3069 * continuation) is generated by the normal code in EmitChoices, and steps back
3070 * in the input to the starting position when it fails to match. The loop code
3071 * looks like this (U is the unwind code that steps back in the greedy loop).
3072 *
3073 * _____
3074 * / \
3075 * V |
3076 * ----------> S1 |
3077 * /| |
3078 * / |S |
3079 * F/ \_____/
3080 * /
3081 * |<-----
3082 * | \
3083 * V |S
3084 * Q2 ---> U----->backtrack
3085 * | F /
3086 * S| /
3087 * V F /
3088 * S2--/
3089 */
3090
3092 counter_backtrack_trace_.set_backtrack(&label_);
3093 if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3094}
3095
3096void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3097#ifdef DEBUG
3098 intptr_t choice_count = alternatives_->length();
3099 for (intptr_t i = 0; i < choice_count - 1; i++) {
3100 GuardedAlternative alternative = alternatives_->At(i);
3101 ZoneGrowableArray<Guard*>* guards = alternative.guards();
3102 intptr_t guard_count = (guards == nullptr) ? 0 : guards->length();
3103 for (intptr_t j = 0; j < guard_count; j++) {
3104 ASSERT(!trace->mentions_reg(guards->At(j)->reg()));
3105 }
3106 }
3107#endif
3108}
3109
3110void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3111 Trace* current_trace,
3112 PreloadState* state) {
3113 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3114 // Save some time by looking at most one machine word ahead.
3115 state->eats_at_least_ =
3116 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3117 current_trace->at_start() == Trace::FALSE_VALUE);
3118 }
3119 state->preload_characters_ =
3120 CalculatePreloadCharacters(compiler, state->eats_at_least_);
3121
3122 state->preload_is_current_ =
3123 (current_trace->characters_preloaded() == state->preload_characters_);
3124 state->preload_has_checked_bounds_ = state->preload_is_current_;
3125}
3126
3128 intptr_t choice_count = alternatives_->length();
3129
3130 if (choice_count == 1 && alternatives_->At(0).guards() == nullptr) {
3131 alternatives_->At(0).node()->Emit(compiler, trace);
3132 return;
3133 }
3134
3135 AssertGuardsMentionRegisters(trace);
3136
3137 LimitResult limit_result = LimitVersions(compiler, trace);
3138 if (limit_result == DONE) return;
3139 ASSERT(limit_result == CONTINUE);
3140
3141 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3142 // other choice nodes we only flush if we are out of code size budget.
3143 if (trace->flush_budget() == 0 && trace->actions() != nullptr) {
3144 trace->Flush(compiler, this);
3145 return;
3146 }
3147
3149
3150 PreloadState preload;
3151 preload.init();
3152 GreedyLoopState greedy_loop_state(not_at_start());
3153
3154 intptr_t text_length =
3156 AlternativeGenerationList alt_gens(choice_count);
3157
3158 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3159 trace = EmitGreedyLoop(compiler, trace, &alt_gens, &preload,
3160 &greedy_loop_state, text_length);
3161 } else {
3162 // TODO(erikcorry): Delete this. We don't need this label, but it makes us
3163 // match the traces produced pre-cleanup.
3164 BlockLabel second_choice;
3165 compiler->macro_assembler()->BindBlock(&second_choice);
3166
3167 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3168
3169 EmitChoices(compiler, &alt_gens, 0, trace, &preload);
3170 }
3171
3172 // At this point we need to generate slow checks for the alternatives where
3173 // the quick check was inlined. We can recognize these because the associated
3174 // label was bound.
3175 intptr_t new_flush_budget = trace->flush_budget() / choice_count;
3176 for (intptr_t i = 0; i < choice_count; i++) {
3177 AlternativeGeneration* alt_gen = alt_gens.at(i);
3178 Trace new_trace(*trace);
3179 // If there are actions to be flushed we have to limit how many times
3180 // they are flushed. Take the budget of the parent trace and distribute
3181 // it fairly amongst the children.
3182 if (new_trace.actions() != nullptr) {
3183 new_trace.set_flush_budget(new_flush_budget);
3184 }
3185 bool next_expects_preload =
3186 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3187 EmitOutOfLineContinuation(compiler, &new_trace, alternatives_->At(i),
3188 alt_gen, preload.preload_characters_,
3189 next_expects_preload);
3190 }
3191}
3192
3193Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
3194 Trace* trace,
3195 AlternativeGenerationList* alt_gens,
3196 PreloadState* preload,
3197 GreedyLoopState* greedy_loop_state,
3198 intptr_t text_length) {
3199 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3200 // Here we have special handling for greedy loops containing only text nodes
3201 // and other simple nodes. These are handled by pushing the current
3202 // position on the stack and then incrementing the current position each
3203 // time around the switch. On backtrack we decrement the current position
3204 // and check it against the pushed value. This avoids pushing backtrack
3205 // information for each iteration of the loop, which could take up a lot of
3206 // space.
3207 ASSERT(trace->stop_node() == nullptr);
3208 macro_assembler->PushCurrentPosition();
3209 BlockLabel greedy_match_failed;
3210 Trace greedy_match_trace;
3211 if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
3212 greedy_match_trace.set_backtrack(&greedy_match_failed);
3213 BlockLabel loop_label;
3214 macro_assembler->BindBlock(&loop_label);
3215 macro_assembler->CheckPreemption(/*is_backtrack=*/false);
3216 greedy_match_trace.set_stop_node(this);
3217 greedy_match_trace.set_loop_label(&loop_label);
3218 (*alternatives_)[0].node()->Emit(compiler, &greedy_match_trace);
3219 macro_assembler->BindBlock(&greedy_match_failed);
3220
3221 BlockLabel second_choice; // For use in greedy matches.
3222 macro_assembler->BindBlock(&second_choice);
3223
3224 Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
3225
3226 EmitChoices(compiler, alt_gens, 1, new_trace, preload);
3227
3228 macro_assembler->BindBlock(greedy_loop_state->label());
3229 // If we have unwound to the bottom then backtrack.
3230 macro_assembler->CheckGreedyLoop(trace->backtrack());
3231 // Otherwise try the second priority at an earlier position.
3232 macro_assembler->AdvanceCurrentPosition(-text_length);
3233 macro_assembler->GoTo(&second_choice);
3234 return new_trace;
3235}
3236
3237intptr_t ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
3238 Trace* trace) {
3239 intptr_t eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
3240 if (alternatives_->length() != 2) return eats_at_least;
3241
3242 GuardedAlternative alt1 = alternatives_->At(1);
3243 if (alt1.guards() != nullptr && alt1.guards()->length() != 0) {
3244 return eats_at_least;
3245 }
3246 RegExpNode* eats_anything_node = alt1.node();
3247 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
3248 return eats_at_least;
3249 }
3250
3251 // Really we should be creating a new trace when we execute this function,
3252 // but there is no need, because the code it generates cannot backtrack, and
3253 // we always arrive here with a trivial trace (since it's the entry to a
3254 // loop. That also implies that there are no preloaded characters, which is
3255 // good, because it means we won't be violating any assumptions by
3256 // overwriting those characters with new load instructions.
3257 ASSERT(trace->is_trivial());
3258
3259 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3260 // At this point we know that we are at a non-greedy loop that will eat
3261 // any character one at a time. Any non-anchored regexp has such a
3262 // loop prepended to it in order to find where it starts. We look for
3263 // a pattern of the form ...abc... where we can look 6 characters ahead
3264 // and step forwards 3 if the character is not one of abc. Abc need
3265 // not be atoms, they can be any reasonably limited character class or
3266 // small alternation.
3267 BoyerMooreLookahead* bm = bm_info(false);
3268 if (bm == nullptr) {
3269 eats_at_least = Utils::Minimum(
3272 if (eats_at_least >= 1) {
3273 bm = new (Z) BoyerMooreLookahead(eats_at_least, compiler, Z);
3274 GuardedAlternative alt0 = alternatives_->At(0);
3275 alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, false);
3276 }
3277 }
3278 if (bm != nullptr) {
3279 bm->EmitSkipInstructions(macro_assembler);
3280 }
3281 return eats_at_least;
3282}
3283
3284void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
3285 AlternativeGenerationList* alt_gens,
3286 intptr_t first_choice,
3287 Trace* trace,
3288 PreloadState* preload) {
3289 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3290 SetUpPreLoad(compiler, trace, preload);
3291
3292 // For now we just call all choices one after the other. The idea ultimately
3293 // is to use the Dispatch table to try only the relevant ones.
3294 intptr_t choice_count = alternatives_->length();
3295
3296 intptr_t new_flush_budget = trace->flush_budget() / choice_count;
3297
3298 for (intptr_t i = first_choice; i < choice_count; i++) {
3299 bool is_last = i == choice_count - 1;
3300 bool fall_through_on_failure = !is_last;
3301 GuardedAlternative alternative = alternatives_->At(i);
3302 AlternativeGeneration* alt_gen = alt_gens->at(i);
3303 alt_gen->quick_check_details.set_characters(preload->preload_characters_);
3304 ZoneGrowableArray<Guard*>* guards = alternative.guards();
3305 intptr_t guard_count = (guards == nullptr) ? 0 : guards->length();
3306 Trace new_trace(*trace);
3307 new_trace.set_characters_preloaded(
3308 preload->preload_is_current_ ? preload->preload_characters_ : 0);
3309 if (preload->preload_has_checked_bounds_) {
3310 new_trace.set_bound_checked_up_to(preload->preload_characters_);
3311 }
3312 new_trace.quick_check_performed()->Clear();
3313 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
3314 if (!is_last) {
3315 new_trace.set_backtrack(&alt_gen->after);
3316 }
3317 alt_gen->expects_preload = preload->preload_is_current_;
3318 bool generate_full_check_inline = false;
3319 if (kRegexpOptimization &&
3321 alternative.node()->EmitQuickCheck(
3322 compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
3323 &alt_gen->possible_success, &alt_gen->quick_check_details,
3324 fall_through_on_failure)) {
3325 // Quick check was generated for this choice.
3326 preload->preload_is_current_ = true;
3327 preload->preload_has_checked_bounds_ = true;
3328 // If we generated the quick check to fall through on possible success,
3329 // we now need to generate the full check inline.
3330 if (!fall_through_on_failure) {
3331 macro_assembler->BindBlock(&alt_gen->possible_success);
3332 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3333 new_trace.set_characters_preloaded(preload->preload_characters_);
3334 new_trace.set_bound_checked_up_to(preload->preload_characters_);
3335 generate_full_check_inline = true;
3336 }
3337 } else if (alt_gen->quick_check_details.cannot_match()) {
3338 if (!fall_through_on_failure) {
3339 macro_assembler->GoTo(trace->backtrack());
3340 }
3341 continue;
3342 } else {
3343 // No quick check was generated. Put the full code here.
3344 // If this is not the first choice then there could be slow checks from
3345 // previous cases that go here when they fail. There's no reason to
3346 // insist that they preload characters since the slow check we are about
3347 // to generate probably can't use it.
3348 if (i != first_choice) {
3349 alt_gen->expects_preload = false;
3350 new_trace.InvalidateCurrentCharacter();
3351 }
3352 generate_full_check_inline = true;
3353 }
3354 if (generate_full_check_inline) {
3355 if (new_trace.actions() != nullptr) {
3356 new_trace.set_flush_budget(new_flush_budget);
3357 }
3358 for (intptr_t j = 0; j < guard_count; j++) {
3359 GenerateGuard(macro_assembler, guards->At(j), &new_trace);
3360 }
3361 alternative.node()->Emit(compiler, &new_trace);
3362 preload->preload_is_current_ = false;
3363 }
3364 macro_assembler->BindBlock(&alt_gen->after);
3365 }
3366}
3367
3368void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
3369 Trace* trace,
3370 GuardedAlternative alternative,
3371 AlternativeGeneration* alt_gen,
3372 intptr_t preload_characters,
3373 bool next_expects_preload) {
3374 if (!alt_gen->possible_success.is_linked()) return;
3375
3376 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3377 macro_assembler->BindBlock(&alt_gen->possible_success);
3378 Trace out_of_line_trace(*trace);
3379 out_of_line_trace.set_characters_preloaded(preload_characters);
3380 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3381 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
3382 ZoneGrowableArray<Guard*>* guards = alternative.guards();
3383 intptr_t guard_count = (guards == nullptr) ? 0 : guards->length();
3384 if (next_expects_preload) {
3385 BlockLabel reload_current_char;
3386 out_of_line_trace.set_backtrack(&reload_current_char);
3387 for (intptr_t j = 0; j < guard_count; j++) {
3388 GenerateGuard(macro_assembler, guards->At(j), &out_of_line_trace);
3389 }
3390 alternative.node()->Emit(compiler, &out_of_line_trace);
3391 macro_assembler->BindBlock(&reload_current_char);
3392 // Reload the current character, since the next quick check expects that.
3393 // We don't need to check bounds here because we only get into this
3394 // code through a quick check which already did the checked load.
3395 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false,
3396 preload_characters);
3397 macro_assembler->GoTo(&(alt_gen->after));
3398 } else {
3399 out_of_line_trace.set_backtrack(&(alt_gen->after));
3400 for (intptr_t j = 0; j < guard_count; j++) {
3401 GenerateGuard(macro_assembler, guards->At(j), &out_of_line_trace);
3402 }
3403 alternative.node()->Emit(compiler, &out_of_line_trace);
3404 }
3405}
3406
3408 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3409 LimitResult limit_result = LimitVersions(compiler, trace);
3410 if (limit_result == DONE) return;
3411 ASSERT(limit_result == CONTINUE);
3412
3414
3415 switch (action_type_) {
3416 case STORE_POSITION: {
3417 Trace::DeferredCapture new_capture(data_.u_position_register.reg,
3418 data_.u_position_register.is_capture,
3419 trace);
3420 Trace new_trace = *trace;
3421 new_trace.add_action(&new_capture);
3422 on_success()->Emit(compiler, &new_trace);
3423 break;
3424 }
3425 case INCREMENT_REGISTER: {
3427 data_.u_increment_register.reg);
3428 Trace new_trace = *trace;
3429 new_trace.add_action(&new_increment);
3430 on_success()->Emit(compiler, &new_trace);
3431 break;
3432 }
3433 case SET_REGISTER: {
3434 Trace::DeferredSetRegister new_set(data_.u_store_register.reg,
3435 data_.u_store_register.value);
3436 Trace new_trace = *trace;
3437 new_trace.add_action(&new_set);
3438 on_success()->Emit(compiler, &new_trace);
3439 break;
3440 }
3441 case CLEAR_CAPTURES: {
3443 data_.u_clear_captures.range_from, data_.u_clear_captures.range_to));
3444 Trace new_trace = *trace;
3445 new_trace.add_action(&new_capture);
3446 on_success()->Emit(compiler, &new_trace);
3447 break;
3448 }
3449 case BEGIN_SUBMATCH:
3450 if (!trace->is_trivial()) {
3451 trace->Flush(compiler, this);
3452 } else {
3454 data_.u_submatch.current_position_register, 0);
3455 assembler->WriteStackPointerToRegister(
3456 data_.u_submatch.stack_pointer_register);
3457 on_success()->Emit(compiler, trace);
3458 }
3459 break;
3460 case EMPTY_MATCH_CHECK: {
3461 intptr_t start_pos_reg = data_.u_empty_match_check.start_register;
3462 intptr_t stored_pos = 0;
3463 intptr_t rep_reg = data_.u_empty_match_check.repetition_register;
3464 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
3465 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
3466 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
3467 // If we know we haven't advanced and there is no minimum we
3468 // can just backtrack immediately.
3469 assembler->GoTo(trace->backtrack());
3470 } else if (know_dist && stored_pos < trace->cp_offset()) {
3471 // If we know we've advanced we can generate the continuation
3472 // immediately.
3473 on_success()->Emit(compiler, trace);
3474 } else if (!trace->is_trivial()) {
3475 trace->Flush(compiler, this);
3476 } else {
3477 BlockLabel skip_empty_check;
3478 // If we have a minimum number of repetitions we check the current
3479 // number first and skip the empty check if it's not enough.
3480 if (has_minimum) {
3481 intptr_t limit = data_.u_empty_match_check.repetition_limit;
3482 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
3483 }
3484 // If the match is empty we bail out, otherwise we fall through
3485 // to the on-success continuation.
3486 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
3487 trace->backtrack());
3488 assembler->BindBlock(&skip_empty_check);
3489 on_success()->Emit(compiler, trace);
3490 }
3491 break;
3492 }
3494 if (!trace->is_trivial()) {
3495 trace->Flush(compiler, this);
3496 return;
3497 }
3499 data_.u_submatch.current_position_register);
3501 data_.u_submatch.stack_pointer_register);
3502 intptr_t clear_register_count = data_.u_submatch.clear_register_count;
3503 if (clear_register_count == 0) {
3504 on_success()->Emit(compiler, trace);
3505 return;
3506 }
3507 intptr_t clear_registers_from = data_.u_submatch.clear_register_from;
3508 BlockLabel clear_registers_backtrack;
3509 Trace new_trace = *trace;
3510 new_trace.set_backtrack(&clear_registers_backtrack);
3511 on_success()->Emit(compiler, &new_trace);
3512
3513 assembler->BindBlock(&clear_registers_backtrack);
3514 intptr_t clear_registers_to =
3515 clear_registers_from + clear_register_count - 1;
3516 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
3517
3518 ASSERT(trace->backtrack() == nullptr);
3519 assembler->Backtrack();
3520 return;
3521 }
3522 default:
3523 UNREACHABLE();
3524 }
3525}
3526
3528 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3529 if (!trace->is_trivial()) {
3530 trace->Flush(compiler, this);
3531 return;
3532 }
3533
3534 LimitResult limit_result = LimitVersions(compiler, trace);
3535 if (limit_result == DONE) return;
3536 ASSERT(limit_result == CONTINUE);
3537
3539
3540 ASSERT(start_reg_ + 1 == end_reg_);
3541 if (flags_.IgnoreCase()) {
3543 start_reg_, read_backward(), flags_.IsUnicode(), trace->backtrack());
3544 } else {
3545 assembler->CheckNotBackReference(start_reg_, read_backward(),
3546 trace->backtrack());
3547 }
3548 // We are going to advance backward, so we may end up at the start.
3550
3551 // Check that the back reference does not end inside a surrogate pair.
3552 if (flags_.IsUnicode() && !compiler->one_byte()) {
3553 assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
3554 }
3555
3556 on_success()->Emit(compiler, trace);
3557}
3558
3559// -------------------------------------------------------------------
3560// Dot/dotty output
3561
3562#ifdef DEBUG
3563
3564class DotPrinter : public NodeVisitor {
3565 public:
3566 explicit DotPrinter(bool ignore_case) {}
3567 void PrintNode(const char* label, RegExpNode* node);
3568 void Visit(RegExpNode* node);
3569 void PrintAttributes(RegExpNode* from);
3570 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
3571#define DECLARE_VISIT(Type) virtual void Visit##Type(Type##Node* that);
3573#undef DECLARE_VISIT
3574};
3575
3576void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
3577 OS::PrintErr("digraph G {\n graph [label=\"");
3578 for (intptr_t i = 0; label[i] != '\0'; i++) {
3579 switch (label[i]) {
3580 case '\\':
3581 OS::PrintErr("\\\\");
3582 break;
3583 case '"':
3584 OS::PrintErr("\"");
3585 break;
3586 default:
3587 OS::PrintErr("%c", label[i]);
3588 break;
3589 }
3590 }
3591 OS::PrintErr("\"];\n");
3592 Visit(node);
3593 OS::PrintErr("}\n");
3594}
3595
3596void DotPrinter::Visit(RegExpNode* node) {
3597 if (node->info()->visited) return;
3598 node->info()->visited = true;
3599 node->Accept(this);
3600}
3601
3602void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
3603 OS::PrintErr(" n%p -> n%p [style=dotted];\n", from, on_failure);
3604 Visit(on_failure);
3605}
3606
3607class AttributePrinter : public ValueObject {
3608 public:
3609 AttributePrinter() : first_(true) {}
3610 void PrintSeparator() {
3611 if (first_) {
3612 first_ = false;
3613 } else {
3614 OS::PrintErr("|");
3615 }
3616 }
3617 void PrintBit(const char* name, bool value) {
3618 if (!value) return;
3619 PrintSeparator();
3620 OS::PrintErr("{%s}", name);
3621 }
3622 void PrintPositive(const char* name, intptr_t value) {
3623 if (value < 0) return;
3624 PrintSeparator();
3625 OS::PrintErr("{%s|%" Pd "}", name, value);
3626 }
3627
3628 private:
3629 bool first_;
3630};
3631
3632void DotPrinter::PrintAttributes(RegExpNode* that) {
3634 " a%p [shape=Mrecord, color=grey, fontcolor=grey, "
3635 "margin=0.1, fontsize=10, label=\"{",
3636 that);
3637 AttributePrinter printer;
3638 NodeInfo* info = that->info();
3639 printer.PrintBit("NI", info->follows_newline_interest);
3640 printer.PrintBit("WI", info->follows_word_interest);
3641 printer.PrintBit("SI", info->follows_start_interest);
3642 BlockLabel* label = that->label();
3643 if (label->is_bound()) printer.PrintPositive("@", label->pos());
3645 "}\"];\n"
3646 " a%p -> n%p [style=dashed, color=grey, arrowhead=none];\n",
3647 that, that);
3648}
3649
3650void DotPrinter::VisitChoice(ChoiceNode* that) {
3651 OS::PrintErr(" n%p [shape=Mrecord, label=\"?\"];\n", that);
3652 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
3653 GuardedAlternative alt = that->alternatives()->At(i);
3654 OS::PrintErr(" n%p -> n%p", that, alt.node());
3655 }
3656 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
3657 GuardedAlternative alt = that->alternatives()->At(i);
3658 alt.node()->Accept(this);
3659 }
3660}
3661
3662void DotPrinter::VisitText(TextNode* that) {
3663 OS::PrintErr(" n%p [label=\"", that);
3664 for (intptr_t i = 0; i < that->elements()->length(); i++) {
3665 if (i > 0) OS::PrintErr(" ");
3666 TextElement elm = that->elements()->At(i);
3667 switch (elm.text_type()) {
3668 case TextElement::ATOM: {
3669 ZoneGrowableArray<uint16_t>* data = elm.atom()->data();
3670 for (intptr_t i = 0; i < data->length(); i++) {
3671 OS::PrintErr("%c", static_cast<char>(data->At(i)));
3672 }
3673 break;
3674 }
3676 RegExpCharacterClass* node = elm.char_class();
3677 OS::PrintErr("[");
3678 if (node->is_negated()) OS::PrintErr("^");
3679 for (intptr_t j = 0; j < node->ranges()->length(); j++) {
3680 CharacterRange range = node->ranges()->At(j);
3681 PrintUtf16(range.from());
3682 OS::PrintErr("-");
3683 PrintUtf16(range.to());
3684 }
3685 OS::PrintErr("]");
3686 break;
3687 }
3688 default:
3689 UNREACHABLE();
3690 }
3691 }
3692 OS::PrintErr("\", shape=box, peripheries=2];\n");
3693 PrintAttributes(that);
3694 OS::PrintErr(" n%p -> n%p;\n", that, that->on_success());
3695 Visit(that->on_success());
3696}
3697
3698void DotPrinter::VisitBackReference(BackReferenceNode* that) {
3699 OS::PrintErr(" n%p [label=\"$%" Pd "..$%" Pd "\", shape=doubleoctagon];\n",
3700 that, that->start_register(), that->end_register());
3701 PrintAttributes(that);
3702 OS::PrintErr(" n%p -> n%p;\n", that, that->on_success());
3703 Visit(that->on_success());
3704}
3705
3706void DotPrinter::VisitEnd(EndNode* that) {
3707 OS::PrintErr(" n%p [style=bold, shape=point];\n", that);
3708 PrintAttributes(that);
3709}
3710
3711void DotPrinter::VisitAssertion(AssertionNode* that) {
3712 OS::PrintErr(" n%p [", that);
3713 switch (that->assertion_type()) {
3715 OS::PrintErr("label=\"$\", shape=septagon");
3716 break;
3718 OS::PrintErr("label=\"^\", shape=septagon");
3719 break;
3721 OS::PrintErr("label=\"\\b\", shape=septagon");
3722 break;
3724 OS::PrintErr("label=\"\\B\", shape=septagon");
3725 break;
3727 OS::PrintErr("label=\"(?<=\\n)\", shape=septagon");
3728 break;
3729 }
3730 OS::PrintErr("];\n");
3731 PrintAttributes(that);
3732 RegExpNode* successor = that->on_success();
3733 OS::PrintErr(" n%p -> n%p;\n", that, successor);
3734 Visit(successor);
3735}
3736
3737void DotPrinter::VisitAction(ActionNode* that) {
3738 OS::PrintErr(" n%p [", that);
3739 switch (that->action_type_) {
3741 OS::PrintErr("label=\"$%" Pd ":=%" Pd "\", shape=octagon",
3742 that->data_.u_store_register.reg,
3743 that->data_.u_store_register.value);
3744 break;
3746 OS::PrintErr("label=\"$%" Pd "++\", shape=octagon",
3747 that->data_.u_increment_register.reg);
3748 break;
3750 OS::PrintErr("label=\"$%" Pd ":=$pos\", shape=octagon",
3751 that->data_.u_position_register.reg);
3752 break;
3754 OS::PrintErr("label=\"$%" Pd ":=$pos,begin\", shape=septagon",
3755 that->data_.u_submatch.current_position_register);
3756 break;
3758 OS::PrintErr("label=\"escape\", shape=septagon");
3759 break;
3761 OS::PrintErr("label=\"$%" Pd "=$pos?,$%" Pd "<%" Pd "?\", shape=septagon",
3762 that->data_.u_empty_match_check.start_register,
3763 that->data_.u_empty_match_check.repetition_register,
3764 that->data_.u_empty_match_check.repetition_limit);
3765 break;
3767 OS::PrintErr("label=\"clear $%" Pd " to $%" Pd "\", shape=septagon",
3768 that->data_.u_clear_captures.range_from,
3769 that->data_.u_clear_captures.range_to);
3770 break;
3771 }
3772 }
3773 OS::PrintErr("];\n");
3774 PrintAttributes(that);
3775 RegExpNode* successor = that->on_success();
3776 OS::PrintErr(" n%p -> n%p;\n", that, successor);
3777 Visit(successor);
3778}
3779
3780void RegExpEngine::DotPrint(const char* label,
3781 RegExpNode* node,
3782 bool ignore_case) {
3783 DotPrinter printer(ignore_case);
3784 printer.PrintNode(label, node);
3785}
3786
3787#endif // DEBUG
3788
3789// -------------------------------------------------------------------
3790// Tree to graph conversion
3791
3792// The zone in which we allocate graph nodes.
3793#define OZ (on_success->zone())
3794
3796 RegExpNode* on_success) {
3799 elms->Add(TextElement::Atom(this));
3800 return new (OZ) TextNode(elms, compiler->read_backward(), on_success);
3801}
3802
3804 RegExpNode* on_success) {
3807 for (intptr_t i = 0; i < elements()->length(); i++) {
3808 elms->Add(elements()->At(i));
3809 }
3810 return new (OZ) TextNode(elms, compiler->read_backward(), on_success);
3811}
3812
3814 const int32_t* special_class,
3815 intptr_t length) {
3816 length--; // Remove final kRangeEndMarker.
3817 ASSERT(special_class[length] == kRangeEndMarker);
3818 ASSERT(ranges->length() != 0);
3819 ASSERT(length != 0);
3820 ASSERT(special_class[0] != 0);
3821 if (ranges->length() != (length >> 1) + 1) {
3822 return false;
3823 }
3824 CharacterRange range = ranges->At(0);
3825 if (range.from() != 0) {
3826 return false;
3827 }
3828 for (intptr_t i = 0; i < length; i += 2) {
3829 if (special_class[i] != (range.to() + 1)) {
3830 return false;
3831 }
3832 range = ranges->At((i >> 1) + 1);
3833 if (special_class[i + 1] != range.from()) {
3834 return false;
3835 }
3836 }
3837 if (range.to() != Utf::kMaxCodePoint) {
3838 return false;
3839 }
3840 return true;
3841}
3842
3844 const int32_t* special_class,
3845 intptr_t length) {
3846 length--; // Remove final kRangeEndMarker.
3847 ASSERT(special_class[length] == kRangeEndMarker);
3848 if (ranges->length() * 2 != length) {
3849 return false;
3850 }
3851 for (intptr_t i = 0; i < length; i += 2) {
3852 CharacterRange range = ranges->At(i >> 1);
3853 if (range.from() != special_class[i] ||
3854 range.to() != special_class[i + 1] - 1) {
3855 return false;
3856 }
3857 }
3858 return true;
3859}
3860
3862 // TODO(lrn): Remove need for this function, by not throwing away information
3863 // along the way.
3864 if (is_negated()) {
3865 return false;
3866 }
3867 if (set_.is_standard()) {
3868 return true;
3869 }
3871 set_.set_standard_set_type('s');
3872 return true;
3873 }
3875 set_.set_standard_set_type('S');
3876 return true;
3877 }
3880 set_.set_standard_set_type('.');
3881 return true;
3882 }
3885 set_.set_standard_set_type('n');
3886 return true;
3887 }
3889 set_.set_standard_set_type('w');
3890 return true;
3891 }
3893 set_.set_standard_set_type('W');
3894 return true;
3895 }
3896 return false;
3897}
3898
3900 Zone* zone,
3902 : zone_(zone),
3903 table_(zone),
3904 bmp_(nullptr),
3905 lead_surrogates_(nullptr),
3906 trail_surrogates_(nullptr),
3907 non_bmp_(nullptr) {
3908 // The unicode range splitter categorizes given character ranges into:
3909 // - Code points from the BMP representable by one code unit.
3910 // - Code points outside the BMP that need to be split into surrogate pairs.
3911 // - Lone lead surrogates.
3912 // - Lone trail surrogates.
3913 // Lone surrogates are valid code points, even though no actual characters.
3914 // They require special matching to make sure we do not split surrogate pairs.
3915 // We use the dispatch table to accomplish this. The base range is split up
3916 // by the table by the overlay ranges, and the Call callback is used to
3917 // filter and collect ranges for each category.
3918 for (intptr_t i = 0; i < base->length(); i++) {
3919 table_.AddRange(base->At(i), kBase, zone_);
3920 }
3921 // Add overlay ranges.
3923 kBmpCodePoints, zone_);
3926 kLeadSurrogates, zone_);
3929 kTrailSurrogates, zone_);
3930 table_.AddRange(
3932 kBmpCodePoints, zone_);
3933 table_.AddRange(
3935 kNonBmpCodePoints, zone_);
3936 table_.ForEach(this);
3937}
3938
3940 OutSet* outset = entry.out_set();
3941 if (!outset->Get(kBase)) return;
3943 if (outset->Get(kBmpCodePoints)) {
3944 target = &bmp_;
3945 } else if (outset->Get(kLeadSurrogates)) {
3946 target = &lead_surrogates_;
3947 } else if (outset->Get(kTrailSurrogates)) {
3948 target = &trail_surrogates_;
3949 } else {
3950 ASSERT(outset->Get(kNonBmpCodePoints));
3951 target = &non_bmp_;
3952 }
3953 if (*target == nullptr) {
3954 *target = new (zone_) ZoneGrowableArray<CharacterRange>(2);
3955 }
3956 (*target)->Add(CharacterRange::Range(entry.from(), entry.to()));
3957}
3958
3961 RegExpNode* on_success,
3962 UnicodeRangeSplitter* splitter) {
3963 ZoneGrowableArray<CharacterRange>* bmp = splitter->bmp();
3964 if (bmp == nullptr) return;
3966 bmp, compiler->read_backward(), on_success, RegExpFlags())));
3967}
3968
3971 RegExpNode* on_success,
3972 UnicodeRangeSplitter* splitter) {
3973 ZoneGrowableArray<CharacterRange>* non_bmp = splitter->non_bmp();
3974 if (non_bmp == nullptr) return;
3975 ASSERT(!compiler->one_byte());
3977 for (int i = 0; i < non_bmp->length(); i++) {
3978 // Match surrogate pair.
3979 // E.g. [\u10005-\u11005] becomes
3980 // \ud800[\udc05-\udfff]|
3981 // [\ud801-\ud803][\udc00-\udfff]|
3982 // \ud804[\udc00-\udc05]
3983 uint32_t from = non_bmp->At(i).from();
3984 uint32_t to = non_bmp->At(i).to();
3985 uint16_t from_points[2];
3986 Utf16::Encode(from, from_points);
3987 uint16_t to_points[2];
3988 Utf16::Encode(to, to_points);
3989 if (from_points[0] == to_points[0]) {
3990 // The lead surrogate is the same.
3991 result->AddAlternative(
3993 CharacterRange::Singleton(from_points[0]),
3994 CharacterRange::Range(from_points[1], to_points[1]),
3995 compiler->read_backward(), on_success, RegExpFlags())));
3996 } else {
3997 if (from_points[1] != Utf16::kTrailSurrogateStart) {
3998 // Add [from_l][from_t-\udfff]
3999 result->AddAlternative(
4001 CharacterRange::Singleton(from_points[0]),
4002 CharacterRange::Range(from_points[1],
4004 compiler->read_backward(), on_success, RegExpFlags())));
4005 from_points[0]++;
4006 }
4007 if (to_points[1] != Utf16::kTrailSurrogateEnd) {
4008 // Add [to_l][\udc00-to_t]
4009 result->AddAlternative(
4011 CharacterRange::Singleton(to_points[0]),
4013 to_points[1]),
4014 compiler->read_backward(), on_success, RegExpFlags())));
4015 to_points[0]--;
4016 }
4017 if (from_points[0] <= to_points[0]) {
4018 // Add [from_l-to_l][\udc00-\udfff]
4019 result->AddAlternative(
4021 CharacterRange::Range(from_points[0], to_points[0]),
4024 compiler->read_backward(), on_success, RegExpFlags())));
4025 }
4026 }
4027 }
4028}
4029
4034 RegExpNode* on_success,
4035 bool read_backward,
4038 match, read_backward, on_success, flags);
4039 int stack_register = compiler->UnicodeLookaroundStackRegister();
4040 int position_register = compiler->UnicodeLookaroundPositionRegister();
4041 RegExpLookaround::Builder lookaround(false, match_node, stack_register,
4042 position_register);
4044 lookbehind, !read_backward, lookaround.on_match_success(), flags);
4045 return lookaround.ForMatch(negative_match);
4046}
4047
4052 RegExpNode* on_success,
4053 bool read_backward,
4055 int stack_register = compiler->UnicodeLookaroundStackRegister();
4056 int position_register = compiler->UnicodeLookaroundPositionRegister();
4057 RegExpLookaround::Builder lookaround(false, on_success, stack_register,
4058 position_register);
4060 lookahead, read_backward, lookaround.on_match_success(), flags);
4062 match, read_backward, lookaround.ForMatch(negative_match), flags);
4063}
4064
4067 RegExpNode* on_success,
4068 UnicodeRangeSplitter* splitter) {
4069 auto lead_surrogates = splitter->lead_surrogates();
4070 if (lead_surrogates == nullptr) return;
4071 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
4072 auto trail_surrogates = CharacterRange::List(
4075
4077 if (compiler->read_backward()) {
4078 // Reading backward. Assert that reading forward, there is no trail
4079 // surrogate, and then backward match the lead surrogate.
4081 compiler, trail_surrogates, lead_surrogates, on_success, true,
4082 RegExpFlags());
4083 } else {
4084 // Reading forward. Forward match the lead surrogate and assert that
4085 // no trail surrogate follows.
4087 compiler, lead_surrogates, trail_surrogates, on_success, false,
4088 RegExpFlags());
4089 }
4090 result->AddAlternative(GuardedAlternative(match));
4091}
4092
4095 RegExpNode* on_success,
4096 UnicodeRangeSplitter* splitter) {
4097 auto trail_surrogates = splitter->trail_surrogates();
4098 if (trail_surrogates == nullptr) return;
4099 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
4100 auto lead_surrogates = CharacterRange::List(
4103
4105 if (compiler->read_backward()) {
4106 // Reading backward. Backward match the trail surrogate and assert that no
4107 // lead surrogate precedes it.
4109 compiler, trail_surrogates, lead_surrogates, on_success, true,
4110 RegExpFlags());
4111 } else {
4112 // Reading forward. Assert that reading backward, there is no lead
4113 // surrogate, and then forward match the trail surrogate.
4115 compiler, lead_surrogates, trail_surrogates, on_success, false,
4116 RegExpFlags());
4117 }
4118 result->AddAlternative(GuardedAlternative(match));
4119}
4120
4122 RegExpNode* on_success) {
4123 // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
4124 ASSERT(!compiler->read_backward());
4125 // Advance any character. If the character happens to be a lead surrogate and
4126 // we advanced into the middle of a surrogate pair, it will work out, as
4127 // nothing will match from there. We will have to advance again, consuming
4128 // the associated trail surrogate.
4129 auto range = CharacterRange::List(
4131 return TextNode::CreateForCharacterRanges(range, false, on_success,
4132 RegExpFlags());
4133}
4134
4137
4138 // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
4139 // See also https://crbug.com/v8/6727.
4140 // TODO(sstrickl): This only covers the special case of the {0,0x10FFFF}
4141 // range, which we use frequently internally. But large ranges can also easily
4142 // be created by the user. We might want to have a more general caching
4143 // mechanism for such ranges.
4144 if (ranges->length() == 1 && ranges->At(0).IsEverything(Utf::kMaxCodePoint)) {
4145 return;
4146 }
4147
4148 icu::UnicodeSet set;
4149 for (int i = 0; i < ranges->length(); i++) {
4150 set.add(ranges->At(i).from(), ranges->At(i).to());
4151 }
4152 ranges->Clear();
4153 set.closeOver(USET_CASE_INSENSITIVE);
4154 // Full case mapping map single characters to multiple characters.
4155 // Those are represented as strings in the set. Remove them so that
4156 // we end up with only simple and common case mappings.
4157 set.removeAllStrings();
4158 for (int i = 0; i < set.getRangeCount(); i++) {
4159 ranges->Add(
4160 CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)));
4161 }
4162 // No errors and everything we collected have been ranges.
4164}
4165
4167 RegExpNode* on_success) {
4168 set_.Canonicalize();
4170 if (flags_.NeedsUnicodeCaseEquivalents()) {
4172 }
4173 if (flags_.IsUnicode() && !compiler->one_byte() &&
4175 if (is_negated()) {
4179 ranges = negated;
4180 }
4181 if (ranges->length() == 0) {
4184 return new TextNode(fail, compiler->read_backward(), on_success);
4185 }
4186 if (standard_type() == '*') {
4187 return UnanchoredAdvance(compiler, on_success);
4188 } else {
4189 ChoiceNode* result = new (OZ) ChoiceNode(2, OZ);
4190 UnicodeRangeSplitter splitter(OZ, ranges);
4191 AddBmpCharacters(compiler, result, on_success, &splitter);
4192 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
4193 AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
4194 AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
4195 return result;
4196 }
4197 } else {
4198 return new TextNode(this, compiler->read_backward(), on_success);
4199 }
4200 return new (OZ) TextNode(this, compiler->read_backward(), on_success);
4201}
4202
4204 RegExpNode* on_success) {
4206 intptr_t length = alternatives->length();
4208 for (intptr_t i = 0; i < length; i++) {
4209 GuardedAlternative alternative(
4210 alternatives->At(i)->ToNode(compiler, on_success));
4211 result->AddAlternative(alternative);
4212 }
4213 return result;
4214}
4215
4217 RegExpNode* on_success) {
4218 return ToNode(min(), max(), is_greedy(), body(), compiler, on_success);
4219}
4220
4221// Scoped object to keep track of how much we unroll quantifier loops in the
4222// regexp graph generator.
4224 public:
4225 static constexpr intptr_t kMaxExpansionFactor = 6;
4227 : compiler_(compiler),
4228 saved_expansion_factor_(compiler->current_expansion_factor()),
4229 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
4230 ASSERT(factor > 0);
4231 if (ok_to_expand_) {
4232 if (factor > kMaxExpansionFactor) {
4233 // Avoid integer overflow of the current expansion factor.
4234 ok_to_expand_ = false;
4235 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
4236 } else {
4237 intptr_t new_factor = saved_expansion_factor_ * factor;
4238 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
4239 compiler->set_current_expansion_factor(new_factor);
4240 }
4241 }
4242 }
4243
4245 compiler_->set_current_expansion_factor(saved_expansion_factor_);
4246 }
4247
4248 bool ok_to_expand() { return ok_to_expand_; }
4249
4250 private:
4251 RegExpCompiler* compiler_;
4252 intptr_t saved_expansion_factor_;
4253 bool ok_to_expand_;
4254
4256};
4257
4259 intptr_t max,
4260 bool is_greedy,
4261 RegExpTree* body,
4263 RegExpNode* on_success,
4264 bool not_at_start) {
4265 // x{f, t} becomes this:
4266 //
4267 // (r++)<-.
4268 // | `
4269 // | (x)
4270 // v ^
4271 // (r=0)-->(?)---/ [if r < t]
4272 // |
4273 // [if r >= f] \----> ...
4274 //
4275
4276 // 15.10.2.5 RepeatMatcher algorithm.
4277 // The parser has already eliminated the case where max is 0. In the case
4278 // where max_match is zero the parser has removed the quantifier if min was
4279 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
4280
4281 // If we know that we cannot match zero length then things are a little
4282 // simpler since we don't need to make the special zero length match check
4283 // from step 2.1. If the min and max are small we can unroll a little in
4284 // this case.
4285 // Unroll (foo)+ and (foo){3,}
4286 const intptr_t kMaxUnrolledMinMatches = 3;
4287 // Unroll (foo)? and (foo){x,3}
4288 const intptr_t kMaxUnrolledMaxMatches = 3;
4289 if (max == 0) return on_success; // This can happen due to recursion.
4290 bool body_can_be_empty = (body->min_match() == 0);
4291 intptr_t body_start_reg = RegExpCompiler::kNoRegister;
4292 Interval capture_registers = body->CaptureRegisters();
4293 bool needs_capture_clearing = !capture_registers.is_empty();
4294 Zone* zone = compiler->zone();
4295
4296 if (body_can_be_empty) {
4297 body_start_reg = compiler->AllocateRegister();
4298 } else if (kRegexpOptimization && !needs_capture_clearing) {
4299 // Only unroll if there are no captures and the body can't be
4300 // empty.
4301 {
4302 RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0));
4303 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
4304 intptr_t new_max = (max == kInfinity) ? max : max - min;
4305 // Recurse once to get the loop or optional matches after the fixed
4306 // ones.
4307 RegExpNode* answer =
4308 ToNode(0, new_max, is_greedy, body, compiler, on_success, true);
4309 // Unroll the forced matches from 0 to min. This can cause chains of
4310 // TextNodes (which the parser does not generate). These should be
4311 // combined if it turns out they hinder good code generation.
4312 for (intptr_t i = 0; i < min; i++) {
4313 answer = body->ToNode(compiler, answer);
4314 }
4315 return answer;
4316 }
4317 }
4318 if (max <= kMaxUnrolledMaxMatches && min == 0) {
4319 ASSERT(max > 0); // Due to the 'if' above.
4321 if (limiter.ok_to_expand()) {
4322 // Unroll the optional matches up to max.
4323 RegExpNode* answer = on_success;
4324 for (intptr_t i = 0; i < max; i++) {
4325 ChoiceNode* alternation = new (zone) ChoiceNode(2, zone);
4326 if (is_greedy) {
4327 alternation->AddAlternative(
4329 alternation->AddAlternative(GuardedAlternative(on_success));
4330 } else {
4331 alternation->AddAlternative(GuardedAlternative(on_success));
4332 alternation->AddAlternative(
4334 }
4335 answer = alternation;
4336 if (not_at_start && !compiler->read_backward()) {
4337 alternation->set_not_at_start();
4338 }
4339 }
4340 return answer;
4341 }
4342 }
4343 }
4344 bool has_min = min > 0;
4345 bool has_max = max < RegExpTree::kInfinity;
4346 bool needs_counter = has_min || has_max;
4347 intptr_t reg_ctr = needs_counter ? compiler->AllocateRegister()
4349 LoopChoiceNode* center = new (zone)
4350 LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
4351 if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
4352 RegExpNode* loop_return =
4353 needs_counter ? static_cast<RegExpNode*>(
4355 : static_cast<RegExpNode*>(center);
4356 if (body_can_be_empty) {
4357 // If the body can be empty we need to check if it was and then
4358 // backtrack.
4359 loop_return =
4360 ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return);
4361 }
4362 RegExpNode* body_node = body->ToNode(compiler, loop_return);
4363 if (body_can_be_empty) {
4364 // If the body can be empty we need to store the start position
4365 // so we can bail out if it was empty.
4366 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
4367 }
4368 if (needs_capture_clearing) {
4369 // Before entering the body of this loop we need to clear captures.
4370 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
4371 }
4372 GuardedAlternative body_alt(body_node);
4373 if (has_max) {
4374 Guard* body_guard = new (zone) Guard(reg_ctr, Guard::LT, max);
4375 body_alt.AddGuard(body_guard, zone);
4376 }
4377 GuardedAlternative rest_alt(on_success);
4378 if (has_min) {
4379 Guard* rest_guard = new (zone) Guard(reg_ctr, Guard::GEQ, min);
4380 rest_alt.AddGuard(rest_guard, zone);
4381 }
4382 if (is_greedy) {
4383 center->AddLoopAlternative(body_alt);
4384 center->AddContinueAlternative(rest_alt);
4385 } else {
4386 center->AddContinueAlternative(rest_alt);
4387 center->AddLoopAlternative(body_alt);
4388 }
4389 if (needs_counter) {
4390 return ActionNode::SetRegister(reg_ctr, 0, center);
4391 } else {
4392 return center;
4393 }
4394}
4395
4396namespace {
4397// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
4398// \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
4399RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
4400 RegExpNode* on_success,
4403 ASSERT(flags.NeedsUnicodeCaseEquivalents());
4406 CharacterRange::AddClassEscape('w', word_range, true);
4407 int stack_register = compiler->UnicodeLookaroundStackRegister();
4408 int position_register = compiler->UnicodeLookaroundPositionRegister();
4409 ChoiceNode* result = new (OZ) ChoiceNode(2, OZ);
4410 // Add two choices. The (non-)boundary could start with a word or
4411 // a non-word-character.
4412 for (int i = 0; i < 2; i++) {
4413 bool lookbehind_for_word = i == 0;
4414 bool lookahead_for_word =
4415 (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
4416 // Look to the left.
4417 RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
4418 stack_register, position_register);
4420 word_range, true, lookbehind.on_match_success(), flags);
4421 // Look to the right.
4422 RegExpLookaround::Builder lookahead(lookahead_for_word,
4423 lookbehind.ForMatch(backward),
4424 stack_register, position_register);
4426 word_range, false, lookahead.on_match_success(), flags);
4427 result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
4428 }
4429 return result;
4430}
4431} // anonymous namespace
4432
4434 RegExpNode* on_success) {
4435 switch (assertion_type()) {
4436 case START_OF_LINE:
4437 return AssertionNode::AfterNewline(on_success);
4438 case START_OF_INPUT:
4439 return AssertionNode::AtStart(on_success);
4440 case BOUNDARY:
4441 return flags_.NeedsUnicodeCaseEquivalents()
4442 ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY,
4443 flags_)
4444 : AssertionNode::AtBoundary(on_success);
4445 case NON_BOUNDARY:
4446 return flags_.NeedsUnicodeCaseEquivalents()
4447 ? BoundaryAssertionAsLookaround(compiler, on_success,
4448 NON_BOUNDARY, flags_)
4449 : AssertionNode::AtNonBoundary(on_success);
4450 case END_OF_INPUT:
4451 return AssertionNode::AtEnd(on_success);
4452 case END_OF_LINE: {
4453 // Compile $ in multiline regexps as an alternation with a positive
4454 // lookahead in one side and an end-of-input on the other side.
4455 // We need two registers for the lookahead.
4456 intptr_t stack_pointer_register = compiler->AllocateRegister();
4457 intptr_t position_register = compiler->AllocateRegister();
4458 // The ChoiceNode to distinguish between a newline and end-of-input.
4459 ChoiceNode* result = new ChoiceNode(2, on_success->zone());
4460 // Create a newline atom.
4461 ZoneGrowableArray<CharacterRange>* newline_ranges =
4463 CharacterRange::AddClassEscape('n', newline_ranges);
4464 RegExpCharacterClass* newline_atom =
4466 TextNode* newline_matcher =
4467 new TextNode(newline_atom, /*read_backwards=*/false,
4469 stack_pointer_register, position_register,
4470 0, // No captures inside.
4471 -1, // Ignored if no captures.
4472 on_success));
4473 // Create an end-of-input matcher.
4475 stack_pointer_register, position_register, newline_matcher);
4476 // Add the two alternatives to the ChoiceNode.
4477 GuardedAlternative eol_alternative(end_of_line);
4478 result->AddAlternative(eol_alternative);
4479 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
4480 result->AddAlternative(end_alternative);
4481 return result;
4482 }
4483 default:
4484 UNREACHABLE();
4485 }
4486 return on_success;
4487}
4488
4490 RegExpNode* on_success) {
4493 compiler->read_backward(), on_success);
4494}
4495
4497 RegExpNode* on_success) {
4498 return on_success;
4499}
4500
4502 RegExpNode* on_success,
4503 intptr_t stack_pointer_register,
4504 intptr_t position_register,
4505 intptr_t capture_register_count,
4506 intptr_t capture_register_start)
4507 : is_positive_(is_positive),
4508 on_success_(on_success),
4509 stack_pointer_register_(stack_pointer_register),
4510 position_register_(position_register) {
4511 if (is_positive_) {
4512 on_match_success_ = ActionNode::PositiveSubmatchSuccess(
4513 stack_pointer_register, position_register, capture_register_count,
4514 capture_register_start, on_success);
4515 } else {
4516 on_match_success_ = new (OZ) NegativeSubmatchSuccess(
4517 stack_pointer_register, position_register, capture_register_count,
4518 capture_register_start, OZ);
4519 }
4520}
4521
4523 if (is_positive_) {
4524 return ActionNode::BeginSubmatch(stack_pointer_register_,
4525 position_register_, match);
4526 } else {
4527 Zone* zone = on_success_->zone();
4528 // We use a ChoiceNode to represent the negative lookaround. The first
4529 // alternative is the negative match. On success, the end node backtracks.
4530 // On failure, the second alternative is tried and leads to success.
4531 // NegativeLookaroundChoiceNode is a special ChoiceNode that ignores the
4532 // first exit when calculating quick checks.
4533 ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
4534 GuardedAlternative(match), GuardedAlternative(on_success_), zone);
4535 return ActionNode::BeginSubmatch(stack_pointer_register_,
4536 position_register_, choice_node);
4537 }
4538}
4539
4541 RegExpNode* on_success) {
4542 intptr_t stack_pointer_register = compiler->AllocateRegister();
4543 intptr_t position_register = compiler->AllocateRegister();
4544
4545 const intptr_t registers_per_capture = 2;
4546 const intptr_t register_of_first_capture = 2;
4547 intptr_t register_count = capture_count_ * registers_per_capture;
4548 intptr_t register_start =
4549 register_of_first_capture + capture_from_ * registers_per_capture;
4550
4552 bool was_reading_backward = compiler->read_backward();
4553 compiler->set_read_backward(type() == LOOKBEHIND);
4554 Builder builder(is_positive(), on_success, stack_pointer_register,
4555 position_register, register_count, register_start);
4556 RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
4557 result = builder.ForMatch(match);
4558 compiler->set_read_backward(was_reading_backward);
4559 return result;
4560}
4561
4563 RegExpNode* on_success) {
4564 return ToNode(body(), index(), compiler, on_success);
4565}
4566
4568 intptr_t index,
4570 RegExpNode* on_success) {
4571 ASSERT(body != nullptr);
4572 intptr_t start_reg = RegExpCapture::StartRegister(index);
4573 intptr_t end_reg = RegExpCapture::EndRegister(index);
4574 if (compiler->read_backward()) {
4575 intptr_t tmp = end_reg;
4576 end_reg = start_reg;
4577 start_reg = tmp;
4578 }
4579 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
4580 RegExpNode* body_node = body->ToNode(compiler, store_end);
4581 return ActionNode::StorePosition(start_reg, true, body_node);
4582}
4583
4585 RegExpNode* on_success) {
4586 ZoneGrowableArray<RegExpTree*>* children = nodes();
4587 RegExpNode* current = on_success;
4588 if (compiler->read_backward()) {
4589 for (intptr_t i = 0; i < children->length(); i++) {
4590 current = children->At(i)->ToNode(compiler, current);
4591 }
4592 } else {
4593 for (intptr_t i = children->length() - 1; i >= 0; i--) {
4594 current = children->At(i)->ToNode(compiler, current);
4595 }
4596 }
4597 return current;
4598}
4599
4600static void AddClass(const int32_t* elmv,
4601 intptr_t elmc,
4603 elmc--;
4604 ASSERT(elmv[elmc] == kRangeEndMarker);
4605 for (intptr_t i = 0; i < elmc; i += 2) {
4606 ASSERT(elmv[i] < elmv[i + 1]);
4607 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1));
4608 }
4609}
4610
4611static void AddClassNegated(const int32_t* elmv,
4612 intptr_t elmc,
4614 elmc--;
4615 ASSERT(elmv[elmc] == kRangeEndMarker);
4616 ASSERT(elmv[0] != 0x0000);
4617 ASSERT(elmv[elmc - 1] != Utf::kMaxCodePoint);
4618 uint16_t last = 0x0000;
4619 for (intptr_t i = 0; i < elmc; i += 2) {
4620 ASSERT(last <= elmv[i] - 1);
4621 ASSERT(elmv[i] < elmv[i + 1]);
4622 ranges->Add(CharacterRange(last, elmv[i] - 1));
4623 last = elmv[i + 1];
4624 }
4625 ranges->Add(CharacterRange(last, Utf::kMaxCodePoint));
4626}
4627
4630 bool add_unicode_case_equivalents) {
4631 if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
4632 // See #sec-runtime-semantics-wordcharacters-abstract-operation
4633 // In case of unicode and ignore_case, we need to create the closure over
4634 // case equivalent characters before negating.
4637 AddClass(kWordRanges, kWordRangeCount, new_ranges);
4638 AddUnicodeCaseEquivalents(new_ranges);
4639 if (type == 'W') {
4642 CharacterRange::Negate(new_ranges, negated);
4643 new_ranges = negated;
4644 }
4645 ranges->AddArray(*new_ranges);
4646 return;
4647 }
4648 AddClassEscape(type, ranges);
4649}
4650
4653 switch (type) {
4654 case 's':
4656 break;
4657 case 'S':
4659 break;
4660 case 'w':
4662 break;
4663 case 'W':
4665 break;
4666 case 'd':
4668 break;
4669 case 'D':
4671 break;
4672 case '.':
4674 break;
4675 // This is not a character range as defined by the spec but a
4676 // convenient shorthand for a character class that matches any
4677 // character.
4678 case '*':
4680 break;
4681 // This is the set of characters matched by the $ and ^ symbols
4682 // in multiline mode.
4683 case 'n':
4685 break;
4686 default:
4687 UNREACHABLE();
4688 }
4689}
4690
4693 bool is_one_byte,
4694 Zone* zone) {
4696 int range_count = ranges->length();
4697 for (intptr_t i = 0; i < range_count; i++) {
4698 CharacterRange range = ranges->At(i);
4699 int32_t bottom = range.from();
4700 if (bottom > Utf16::kMaxCodeUnit) continue;
4701 int32_t top = Utils::Minimum(range.to(), Utf16::kMaxCodeUnit);
4702 // Nothing to be done for surrogates
4703 if (bottom >= Utf16::kLeadSurrogateStart &&
4705 continue;
4706 }
4707 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
4708 if (bottom > Symbols::kMaxOneCharCodeSymbol) continue;
4711 }
4712 }
4713
4717 if (top == bottom) {
4718 // If this is a singleton we just expand the one character.
4719 intptr_t length = jsregexp_uncanonicalize.get(bottom, '\0', chars);
4720 for (intptr_t i = 0; i < length; i++) {
4721 int32_t chr = chars[i];
4722 if (chr != bottom) {
4723 ranges->Add(CharacterRange::Singleton(chars[i]));
4724 }
4725 }
4726 } else {
4727 // If this is a range we expand the characters block by block,
4728 // expanding contiguous subranges (blocks) one at a time.
4729 // The approach is as follows. For a given start character we
4730 // look up the remainder of the block that contains it (represented
4731 // by the end point), for instance we find 'z' if the character
4732 // is 'c'. A block is characterized by the property
4733 // that all characters uncanonicalize in the same way, except that
4734 // each entry in the result is incremented by the distance from the first
4735 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A']
4736 // and the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
4737 // Once we've found the end point we look up its uncanonicalization
4738 // and produce a range for each element. For instance for [c-f]
4739 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
4740 // add a range if it is not already contained in the input, so [c-f]
4741 // will be skipped but [C-F] will be added. If this range is not
4742 // completely contained in a block we do this for all the blocks
4743 // covered by the range (handling characters that is not in a block
4744 // as a "singleton block").
4746 intptr_t pos = bottom;
4747 while (pos <= top) {
4748 intptr_t length = jsregexp_canonrange.get(pos, '\0', range);
4749 int32_t block_end;
4750 if (length == 0) {
4751 block_end = pos;
4752 } else {
4753 ASSERT(length == 1);
4754 block_end = range[0];
4755 }
4756 intptr_t end = (block_end > top) ? top : block_end;
4757 length = jsregexp_uncanonicalize.get(block_end, '\0', range);
4758 for (intptr_t i = 0; i < length; i++) {
4759 int32_t c = range[i];
4760 int32_t range_from = c - (block_end - pos);
4761 int32_t range_to = c - (block_end - end);
4762 if (!(bottom <= range_from && range_to <= top)) {
4763 ranges->Add(CharacterRange(range_from, range_to));
4764 }
4765 }
4766 pos = end + 1;
4767 }
4768 }
4769 }
4770}
4771
4773 ASSERT(ranges != nullptr);
4774 intptr_t n = ranges->length();
4775 if (n <= 1) return true;
4776 intptr_t max = ranges->At(0).to();
4777 for (intptr_t i = 1; i < n; i++) {
4778 CharacterRange next_range = ranges->At(i);
4779 if (next_range.from() <= max + 1) return false;
4780 max = next_range.to();
4781 }
4782 return true;
4783}
4784
4786 if (ranges_ == nullptr) {
4787 ranges_ = new ZoneGrowableArray<CharacterRange>(2);
4788 CharacterRange::AddClassEscape(standard_set_type_, ranges_);
4789 }
4790 return ranges_;
4791}
4792
4793// Move a number of elements in a zone array to another position
4794// in the same array. Handles overlapping source and target areas.
4796 intptr_t from,
4797 intptr_t to,
4798 intptr_t count) {
4799 // Ranges are potentially overlapping.
4800 if (from < to) {
4801 for (intptr_t i = count - 1; i >= 0; i--) {
4802 (*list)[to + i] = list->At(from + i);
4803 }
4804 } else {
4805 for (intptr_t i = 0; i < count; i++) {
4806 (*list)[to + i] = list->At(from + i);
4807 }
4808 }
4809}
4810
4813 intptr_t count,
4814 CharacterRange insert) {
4815 // Inserts a range into list[0..count[, which must be sorted
4816 // by from value and non-overlapping and non-adjacent, using at most
4817 // list[0..count] for the result. Returns the number of resulting
4818 // canonicalized ranges. Inserting a range may collapse existing ranges into
4819 // fewer ranges, so the return value can be anything in the range 1..count+1.
4820 int32_t from = insert.from();
4821 int32_t to = insert.to();
4822 intptr_t start_pos = 0;
4823 intptr_t end_pos = count;
4824 for (intptr_t i = count - 1; i >= 0; i--) {
4825 CharacterRange current = list->At(i);
4826 if (current.from() > to + 1) {
4827 end_pos = i;
4828 } else if (current.to() + 1 < from) {
4829 start_pos = i + 1;
4830 break;
4831 }
4832 }
4833
4834 // Inserted range overlaps, or is adjacent to, ranges at positions
4835 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
4836 // not affected by the insertion.
4837 // If start_pos == end_pos, the range must be inserted before start_pos.
4838 // if start_pos < end_pos, the entire range from start_pos to end_pos
4839 // must be merged with the insert range.
4840
4841 if (start_pos == end_pos) {
4842 // Insert between existing ranges at position start_pos.
4843 if (start_pos < count) {
4844 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
4845 }
4846 (*list)[start_pos] = insert;
4847 return count + 1;
4848 }
4849 if (start_pos + 1 == end_pos) {
4850 // Replace single existing range at position start_pos.
4851 CharacterRange to_replace = list->At(start_pos);
4852 intptr_t new_from = Utils::Minimum(to_replace.from(), from);
4853 intptr_t new_to = Utils::Maximum(to_replace.to(), to);
4854 (*list)[start_pos] = CharacterRange(new_from, new_to);
4855 return count;
4856 }
4857 // Replace a number of existing ranges from start_pos to end_pos - 1.
4858 // Move the remaining ranges down.
4859
4860 intptr_t new_from = Utils::Minimum(list->At(start_pos).from(), from);
4861 intptr_t new_to = Utils::Maximum(list->At(end_pos - 1).to(), to);
4862 if (end_pos < count) {
4863 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
4864 }
4865 (*list)[start_pos] = CharacterRange(new_from, new_to);
4866 return count - (end_pos - start_pos) + 1;
4867}
4868
4870 // Special/default classes are always considered canonical. The result
4871 // of calling ranges() will be sorted.
4872 if (ranges_ == nullptr) return;
4874}
4875
4877 ZoneGrowableArray<CharacterRange>* character_ranges) {
4878 if (character_ranges->length() <= 1) return;
4879 // Check whether ranges are already canonical (increasing, non-overlapping,
4880 // non-adjacent).
4881 intptr_t n = character_ranges->length();
4882 intptr_t max = character_ranges->At(0).to();
4883 intptr_t i = 1;
4884 while (i < n) {
4885 CharacterRange current = character_ranges->At(i);
4886 if (current.from() <= max + 1) {
4887 break;
4888 }
4889 max = current.to();
4890 i++;
4891 }
4892 // Canonical until the i'th range. If that's all of them, we are done.
4893 if (i == n) return;
4894
4895 // The ranges at index i and forward are not canonicalized. Make them so by
4896 // doing the equivalent of insertion sort (inserting each into the previous
4897 // list, in order).
4898 // Notice that inserting a range can reduce the number of ranges in the
4899 // result due to combining of adjacent and overlapping ranges.
4900 intptr_t read = i; // Range to insert.
4901 intptr_t num_canonical = i; // Length of canonicalized part of list.
4902 do {
4903 num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical,
4904 character_ranges->At(read));
4905 read++;
4906 } while (read < n);
4907 character_ranges->TruncateTo(num_canonical);
4908
4909 ASSERT(CharacterRange::IsCanonical(character_ranges));
4910}
4911
4913 ZoneGrowableArray<CharacterRange>* negated_ranges) {
4915 ASSERT(negated_ranges->length() == 0);
4916 intptr_t range_count = ranges->length();
4917 uint32_t from = 0;
4918 intptr_t i = 0;
4919 if (range_count > 0 && ranges->At(0).from() == 0) {
4920 from = ranges->At(0).to();
4921 i = 1;
4922 }
4923 while (i < range_count) {
4924 CharacterRange range = ranges->At(i);
4925 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1));
4926 from = range.to();
4927 i++;
4928 }
4929 if (from < Utf::kMaxCodePoint) {
4930 negated_ranges->Add(CharacterRange(from + 1, Utf::kMaxCodePoint));
4931 }
4932}
4933
4934// -------------------------------------------------------------------
4935// Splay tree
4936
4937// Workaround for the fact that ZoneGrowableArray does not have contains().
4938static bool ArrayContains(ZoneGrowableArray<unsigned>* array, unsigned value) {
4939 for (intptr_t i = 0; i < array->length(); i++) {
4940 if (array->At(i) == value) {
4941 return true;
4942 }
4943 }
4944 return false;
4945}
4946
4947OutSet* OutSet::Extend(unsigned value, Zone* zone) {
4948 if (Get(value)) return this;
4949 if (successors() != nullptr) {
4950 for (int i = 0; i < successors()->length(); i++) {
4951 OutSet* successor = successors()->At(i);
4952 if (successor->Get(value)) return successor;
4953 }
4954 } else {
4955 successors_ = new (zone) ZoneGrowableArray<OutSet*>(2);
4956 }
4957 OutSet* result = new (zone) OutSet(first_, remaining_);
4958 result->Set(value, zone);
4959 successors()->Add(result);
4960 return result;
4961}
4962
4963void OutSet::Set(unsigned value, Zone* zone) {
4964 if (value < kFirstLimit) {
4965 first_ |= (1 << value);
4966 } else {
4967 if (remaining_ == nullptr)
4968 remaining_ = new (zone) ZoneGrowableArray<unsigned>(1);
4969
4970 bool remaining_contains_value = ArrayContains(remaining_, value);
4971 if (remaining_->is_empty() || !remaining_contains_value) {
4972 remaining_->Add(value);
4973 }
4974 }
4975}
4976
4977bool OutSet::Get(unsigned value) const {
4978 if (value < kFirstLimit) {
4979 return (first_ & (1 << value)) != 0;
4980 } else if (remaining_ == nullptr) {
4981 return false;
4982 } else {
4983 return ArrayContains(remaining_, value);
4984 }
4985}
4986
4988
4990 int32_t value,
4991 Zone* zone) {
4992 CharacterRange current = full_range;
4993 if (tree()->is_empty()) {
4994 // If this is the first range we just insert into the table.
4996 bool inserted = tree()->Insert(current.from(), &loc);
4997 ASSERT(inserted);
4998 USE(inserted);
4999 loc.set_value(
5000 Entry(current.from(), current.to(), empty()->Extend(value, zone)));
5001 return;
5002 }
5003 // First see if there is a range to the left of this one that
5004 // overlaps.
5006 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
5007 Entry* entry = &loc.value();
5008 // If we've found a range that overlaps with this one, and it
5009 // starts strictly to the left of this one, we have to fix it
5010 // because the following code only handles ranges that start on
5011 // or after the start point of the range we're adding.
5012 if (entry->from() < current.from() && entry->to() >= current.from()) {
5013 // Snap the overlapping range in half around the start point of
5014 // the range we're adding.
5016 CharacterRange::Range(entry->from(), current.from() - 1);
5017 CharacterRange right = CharacterRange::Range(current.from(), entry->to());
5018 // The left part of the overlapping range doesn't overlap.
5019 // Truncate the whole entry to be just the left part.
5020 entry->set_to(left.to());
5021 // The right part is the one that overlaps. We add this part
5022 // to the map and let the next step deal with merging it with
5023 // the range we're adding.
5025 bool inserted = tree()->Insert(right.from(), &loc);
5026 ASSERT(inserted);
5027 USE(inserted);
5028 loc.set_value(Entry(right.from(), right.to(), entry->out_set()));
5029 }
5030 }
5031 while (current.is_valid()) {
5032 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
5033 (loc.value().from() <= current.to()) &&
5034 (loc.value().to() >= current.from())) {
5035 Entry* entry = &loc.value();
5036 // We have overlap. If there is space between the start point of
5037 // the range we're adding and where the overlapping range starts
5038 // then we have to add a range covering just that space.
5039 if (current.from() < entry->from()) {
5041 bool inserted = tree()->Insert(current.from(), &ins);
5042 ASSERT(inserted);
5043 USE(inserted);
5044 ins.set_value(Entry(current.from(), entry->from() - 1,
5045 empty()->Extend(value, zone)));
5046 current.set_from(entry->from());
5047 }
5048 ASSERT(current.from() == entry->from());
5049 // If the overlapping range extends beyond the one we want to add
5050 // we have to snap the right part off and add it separately.
5051 if (entry->to() > current.to()) {
5053 bool inserted = tree()->Insert(current.to() + 1, &ins);
5054 ASSERT(inserted);
5055 USE(inserted);
5056 ins.set_value(Entry(current.to() + 1, entry->to(), entry->out_set()));
5057 entry->set_to(current.to());
5058 }
5059 ASSERT(entry->to() <= current.to());
5060 // The overlapping range is now completely contained by the range
5061 // we're adding so we can just update it and move the start point
5062 // of the range we're adding just past it.
5063 entry->AddValue(value, zone);
5064 ASSERT(entry->to() + 1 > current.from());
5065 current.set_from(entry->to() + 1);
5066 } else {
5067 // There is no overlap so we can just add the range
5069 bool inserted = tree()->Insert(current.from(), &ins);
5070 ASSERT(inserted);
5071 USE(inserted);
5072 ins.set_value(
5073 Entry(current.from(), current.to(), empty()->Extend(value, zone)));
5074 break;
5075 }
5076 }
5077}
5078
5079OutSet* ChoiceTable::Get(int32_t value) {
5081 if (!tree()->FindGreatestLessThan(value, &loc)) return empty();
5082 Entry* entry = &loc.value();
5083 if (value <= entry->to())
5084 return entry->out_set();
5085 else
5086 return empty();
5087}
5088
5089// -------------------------------------------------------------------
5090// Analysis
5091
5093 if (that->info()->been_analyzed || that->info()->being_analyzed) return;
5094 that->info()->being_analyzed = true;
5095 that->Accept(this);
5096 that->info()->being_analyzed = false;
5097 that->info()->been_analyzed = true;
5098}
5099
5100void Analysis::VisitEnd(EndNode* that) {
5101 // nothing to do
5102}
5103
5105 intptr_t element_count = elements()->length();
5106 // Set up the offsets of the elements relative to the start. This is a fixed
5107 // quantity since a TextNode can only contain fixed-width things.
5108 intptr_t cp_offset = 0;
5109 for (intptr_t i = 0; i < element_count; i++) {
5110 TextElement& elm = (*elements())[i];
5111 elm.set_cp_offset(cp_offset);
5112 cp_offset += elm.length();
5113 }
5114}
5115
5116void Analysis::VisitText(TextNode* that) {
5117 that->MakeCaseIndependent(is_one_byte_);
5118 EnsureAnalyzed(that->on_success());
5119 if (!has_failed()) {
5120 that->CalculateOffsets();
5121 }
5122}
5123
5124void Analysis::VisitAction(ActionNode* that) {
5125 RegExpNode* target = that->on_success();
5126 EnsureAnalyzed(target);
5127 if (!has_failed()) {
5128 // If the next node is interested in what it follows then this node
5129 // has to be interested too so it can pass the information on.
5130 that->info()->AddFromFollowing(target->info());
5131 }
5132}
5133
5134void Analysis::VisitChoice(ChoiceNode* that) {
5135 NodeInfo* info = that->info();
5136 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
5137 RegExpNode* node = (*that->alternatives())[i].node();
5138 EnsureAnalyzed(node);
5139 if (has_failed()) return;
5140 // Anything the following nodes need to know has to be known by
5141 // this node also, so it can pass it on.
5142 info->AddFromFollowing(node->info());
5143 }
5144}
5145
5147 NodeInfo* info = that->info();
5148 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
5149 RegExpNode* node = (*that->alternatives())[i].node();
5150 if (node != that->loop_node()) {
5151 EnsureAnalyzed(node);
5152 if (has_failed()) return;
5153 info->AddFromFollowing(node->info());
5154 }
5155 }
5156 // Check the loop last since it may need the value of this node
5157 // to get a correct result.
5158 EnsureAnalyzed(that->loop_node());
5159 if (!has_failed()) {
5160 info->AddFromFollowing(that->loop_node()->info());
5161 }
5162}
5163
5164void Analysis::VisitBackReference(BackReferenceNode* that) {
5165 EnsureAnalyzed(that->on_success());
5166}
5167
5168void Analysis::VisitAssertion(AssertionNode* that) {
5169 EnsureAnalyzed(that->on_success());
5170}
5171
5173 intptr_t budget,
5175 bool not_at_start) {
5176 // Working out the set of characters that a backreference can match is too
5177 // hard, so we just say that any character can match.
5178 bm->SetRest(offset);
5179 SaveBMInfo(bm, not_at_start, offset);
5180}
5181
5184
5186 intptr_t budget,
5188 bool not_at_start) {
5189 ZoneGrowableArray<GuardedAlternative>* alts = alternatives();
5190 budget = (budget - 1) / alts->length();
5191 for (intptr_t i = 0; i < alts->length(); i++) {
5192 GuardedAlternative& alt = (*alts)[i];
5193 if (alt.guards() != nullptr && alt.guards()->length() != 0) {
5194 bm->SetRest(offset); // Give up trying to fill in info.
5195 SaveBMInfo(bm, not_at_start, offset);
5196 return;
5197 }
5198 alt.node()->FillInBMInfo(offset, budget, bm, not_at_start);
5199 }
5200 SaveBMInfo(bm, not_at_start, offset);
5201}
5202
5203void TextNode::FillInBMInfo(intptr_t initial_offset,
5204 intptr_t budget,
5206 bool not_at_start) {
5207 if (initial_offset >= bm->length()) return;
5208 intptr_t offset = initial_offset;
5209 intptr_t max_char = bm->max_char();
5210 for (intptr_t i = 0; i < elements()->length(); i++) {
5211 if (offset >= bm->length()) {
5212 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5213 return;
5214 }
5215 TextElement text = elements()->At(i);
5216 if (text.text_type() == TextElement::ATOM) {
5217 RegExpAtom* atom = text.atom();
5218 for (intptr_t j = 0; j < atom->length(); j++, offset++) {
5219 if (offset >= bm->length()) {
5220 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5221 return;
5222 }
5223 uint16_t character = atom->data()->At(j);
5224 if (atom->flags().IgnoreCase()) {
5228 chars);
5229 for (intptr_t j = 0; j < length; j++) {
5230 bm->Set(offset, chars[j]);
5231 }
5232 } else {
5233 if (character <= max_char) bm->Set(offset, character);
5234 }
5235 }
5236 } else {
5237 ASSERT(text.text_type() == TextElement::CHAR_CLASS);
5238 RegExpCharacterClass* char_class = text.char_class();
5239 ZoneGrowableArray<CharacterRange>* ranges = char_class->ranges();
5240 if (char_class->is_negated()) {
5241 bm->SetAll(offset);
5242 } else {
5243 for (intptr_t k = 0; k < ranges->length(); k++) {
5244 const CharacterRange& range = ranges->At(k);
5245 if (range.from() > max_char) continue;
5246 intptr_t to =
5247 Utils::Minimum(max_char, static_cast<intptr_t>(range.to()));
5248 bm->SetInterval(offset, Interval(range.from(), to));
5249 }
5250 }
5251 offset++;
5252 }
5253 }
5254 if (offset >= bm->length()) {
5255 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5256 return;
5257 }
5258 on_success()->FillInBMInfo(offset, budget - 1, bm,
5259 true); // Not at start after a text node.
5260 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5261}
5262
5264 RegExpNode* on_success,
5266 // If the regexp matching starts within a surrogate pair, step back
5267 // to the lead surrogate and start matching from there.
5268 ASSERT(!compiler->read_backward());
5269 Zone* zone = compiler->zone();
5270
5271 auto lead_surrogates = CharacterRange::List(
5274 auto trail_surrogates = CharacterRange::List(
5277
5278 ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
5279
5280 int stack_register = compiler->UnicodeLookaroundStackRegister();
5281 int position_register = compiler->UnicodeLookaroundPositionRegister();
5283 lead_surrogates, /*read_backward=*/true, on_success, flags);
5284 RegExpLookaround::Builder builder(/*is_positive=*/true, step_back,
5285 stack_register, position_register);
5287 trail_surrogates, /*read_backward=*/false, builder.on_match_success(),
5288 flags);
5289
5290 optional_step_back->AddAlternative(
5291 GuardedAlternative(builder.ForMatch(match_trail)));
5292 optional_step_back->AddAlternative(GuardedAlternative(on_success));
5293
5294 return optional_step_back;
5295}
5296
5297#if !defined(DART_PRECOMPILED_RUNTIME)
5300 const ParsedFunction* parsed_function,
5301 const ZoneGrowableArray<const ICData*>& ic_data_array,
5302 intptr_t osr_id) {
5303 ASSERT(!FLAG_interpret_irregexp);
5304 Zone* zone = Thread::Current()->zone();
5305
5306 const Function& function = parsed_function->function();
5307 const intptr_t specialization_cid = function.string_specialization_cid();
5308 const bool is_sticky = function.is_sticky_specialization();
5309 const bool is_one_byte = (specialization_cid == kOneByteStringCid);
5310 RegExp& regexp = RegExp::Handle(zone, function.regexp());
5311 const String& pattern = String::Handle(zone, regexp.pattern());
5312
5313 ASSERT(!regexp.IsNull());
5314 ASSERT(!pattern.IsNull());
5315
5316 const bool is_global = regexp.flags().IsGlobal();
5317 const bool is_unicode = regexp.flags().IsUnicode();
5318
5319 RegExpCompiler compiler(data->capture_count, is_one_byte);
5320
5321 // TODO(zerny): Frequency sampling is currently disabled because of several
5322 // issues. We do not want to store subject strings in the regexp object since
5323 // they might be long and we should not prevent their garbage collection.
5324 // Passing them to this function explicitly does not help, since we must
5325 // generate exactly the same IR for both the unoptimizing and optimizing
5326 // pipelines (otherwise it gets confused when i.e. deopt id's differ).
5327 // An option would be to store sampling results in the regexp object, but
5328 // I'm not sure the performance gains are relevant enough.
5329
5330 // Wrap the body of the regexp in capture #0.
5331 RegExpNode* captured_body =
5332 RegExpCapture::ToNode(data->tree, 0, &compiler, compiler.accept());
5333
5334 RegExpNode* node = captured_body;
5335 const bool is_end_anchored = data->tree->IsAnchoredAtEnd();
5336 const bool is_start_anchored = data->tree->IsAnchoredAtStart();
5337 intptr_t max_length = data->tree->max_match();
5338 if (!is_start_anchored && !is_sticky) {
5339 // Add a .*? at the beginning, outside the body capture, unless
5340 // this expression is anchored at the beginning or is sticky.
5342 0, RegExpTree::kInfinity, false,
5343 new (zone) RegExpCharacterClass('*', RegExpFlags()), &compiler,
5344 captured_body, data->contains_anchor);
5345
5346 if (data->contains_anchor) {
5347 // Unroll loop once, to take care of the case that might start
5348 // at the start of input.
5349 ChoiceNode* first_step_node = new (zone) ChoiceNode(2, zone);
5350 first_step_node->AddAlternative(GuardedAlternative(captured_body));
5351 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
5352 new (zone) RegExpCharacterClass('*', RegExpFlags()),
5353 /*read_backwards=*/false, loop_node)));
5354 node = first_step_node;
5355 } else {
5356 node = loop_node;
5357 }
5358 }
5359 if (is_one_byte) {
5361 // Do it again to propagate the new nodes to places where they were not
5362 // put because they had not been calculated yet.
5363 if (node != nullptr) {
5365 }
5366 } else if (is_unicode && (is_global || is_sticky)) {
5367 node = OptionallyStepBackToLeadSurrogate(&compiler, node, regexp.flags());
5368 }
5369
5370 if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone);
5371 data->node = node;
5372 Analysis analysis(is_one_byte);
5373 analysis.EnsureAnalyzed(node);
5374 if (analysis.has_failed()) {
5375 const char* error_message = analysis.error_message();
5376 return CompilationResult(error_message);
5377 }
5378
5379 // Native regexp implementation.
5380
5381 IRRegExpMacroAssembler* macro_assembler = new (zone)
5382 IRRegExpMacroAssembler(specialization_cid, data->capture_count,
5383 parsed_function, ic_data_array, osr_id, zone);
5384
5385 // Inserted here, instead of in Assembler, because it depends on information
5386 // in the AST that isn't replicated in the Node structure.
5387 const intptr_t kMaxBacksearchLimit = 1024;
5388 if (is_end_anchored && !is_start_anchored && !is_sticky &&
5389 max_length < kMaxBacksearchLimit) {
5390 macro_assembler->SetCurrentPositionFromEnd(max_length);
5391 }
5392
5393 if (is_global) {
5395 if (data->tree->min_match() > 0) {
5397 } else if (is_unicode) {
5399 }
5400 macro_assembler->set_global_mode(mode);
5401 }
5402
5404 compiler.Assemble(macro_assembler, node, data->capture_count, pattern);
5405
5406 if (FLAG_trace_irregexp) {
5407 macro_assembler->PrintBlocks();
5408 }
5409
5410 return result;
5411}
5412#endif // !defined(DART_PRECOMPILED_RUNTIME)
5413
5416 const RegExp& regexp,
5417 bool is_one_byte,
5418 bool is_sticky,
5419 Zone* zone) {
5420 ASSERT(FLAG_interpret_irregexp);
5421 const String& pattern = String::Handle(zone, regexp.pattern());
5422
5423 ASSERT(!regexp.IsNull());
5424 ASSERT(!pattern.IsNull());
5425
5426 const bool is_global = regexp.flags().IsGlobal();
5427 const bool is_unicode = regexp.flags().IsUnicode();
5428
5429 RegExpCompiler compiler(data->capture_count, is_one_byte);
5430
5431 // TODO(zerny): Frequency sampling is currently disabled because of several
5432 // issues. We do not want to store subject strings in the regexp object since
5433 // they might be long and we should not prevent their garbage collection.
5434 // Passing them to this function explicitly does not help, since we must
5435 // generate exactly the same IR for both the unoptimizing and optimizing
5436 // pipelines (otherwise it gets confused when i.e. deopt id's differ).
5437 // An option would be to store sampling results in the regexp object, but
5438 // I'm not sure the performance gains are relevant enough.
5439
5440 // Wrap the body of the regexp in capture #0.
5441 RegExpNode* captured_body =
5442 RegExpCapture::ToNode(data->tree, 0, &compiler, compiler.accept());
5443
5444 RegExpNode* node = captured_body;
5445 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
5446 bool is_start_anchored = data->tree->IsAnchoredAtStart();
5447 intptr_t max_length = data->tree->max_match();
5448 if (!is_start_anchored && !is_sticky) {
5449 // Add a .*? at the beginning, outside the body capture, unless
5450 // this expression is anchored at the beginning.
5452 0, RegExpTree::kInfinity, false,
5453 new (zone) RegExpCharacterClass('*', RegExpFlags()), &compiler,
5454 captured_body, data->contains_anchor);
5455
5456 if (data->contains_anchor) {
5457 // Unroll loop once, to take care of the case that might start
5458 // at the start of input.
5459 ChoiceNode* first_step_node = new (zone) ChoiceNode(2, zone);
5460 first_step_node->AddAlternative(GuardedAlternative(captured_body));
5461 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
5462 new (zone) RegExpCharacterClass('*', RegExpFlags()),
5463 /*read_backwards=*/false, loop_node)));
5464 node = first_step_node;
5465 } else {
5466 node = loop_node;
5467 }
5468 }
5469 if (is_one_byte) {
5471 // Do it again to propagate the new nodes to places where they were not
5472 // put because they had not been calculated yet.
5473 if (node != nullptr) {
5475 }
5476 } else if (is_unicode && (is_global || is_sticky)) {
5477 node = OptionallyStepBackToLeadSurrogate(&compiler, node, regexp.flags());
5478 }
5479
5480 if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone);
5481 data->node = node;
5482 Analysis analysis(is_one_byte);
5483 analysis.EnsureAnalyzed(node);
5484 if (analysis.has_failed()) {
5485 const char* error_message = analysis.error_message();
5486 return CompilationResult(error_message);
5487 }
5488
5489 // Bytecode regexp implementation.
5490
5492 BytecodeRegExpMacroAssembler* macro_assembler =
5493 new (zone) BytecodeRegExpMacroAssembler(&buffer, zone);
5494
5495 // Inserted here, instead of in Assembler, because it depends on information
5496 // in the AST that isn't replicated in the Node structure.
5497 const intptr_t kMaxBacksearchLimit = 1024;
5498 if (is_end_anchored && !is_start_anchored && !is_sticky &&
5499 max_length < kMaxBacksearchLimit) {
5500 macro_assembler->SetCurrentPositionFromEnd(max_length);
5501 }
5502
5503 if (is_global) {
5505 if (data->tree->min_match() > 0) {
5507 } else if (is_unicode) {
5509 }
5510 macro_assembler->set_global_mode(mode);
5511 }
5512
5514 compiler.Assemble(macro_assembler, node, data->capture_count, pattern);
5515
5516 if (FLAG_trace_irregexp) {
5517 macro_assembler->PrintBlocks();
5518 }
5519
5520 return result;
5521}
5522
5524 Zone* zone,
5525 const RegExp& regexp,
5526 intptr_t specialization_cid,
5527 bool sticky,
5528 const Object& owner) {
5529 const intptr_t kParamCount = RegExpMacroAssembler::kParamCount;
5530
5531 const FunctionType& signature =
5533 const String& pattern = String::Handle(zone, regexp.pattern());
5534 Function& fn =
5535 Function::Handle(zone, Function::New(signature, pattern,
5536 UntaggedFunction::kIrregexpFunction,
5537 true, // Static.
5538 false, // Not const.
5539 false, // Not abstract.
5540 false, // Not external.
5541 false, // Not native.
5543
5544 // TODO(zerny): Share these arrays between all irregexp functions.
5545 // TODO(regis): Better, share a common signature.
5546 signature.set_num_fixed_parameters(kParamCount);
5547 signature.set_parameter_types(
5548 Array::Handle(zone, Array::New(kParamCount, Heap::kOld)));
5549 fn.CreateNameArray();
5551 Object::dynamic_type());
5553 Symbols::This());
5555 Object::dynamic_type());
5557 Symbols::string_param());
5559 Object::dynamic_type());
5561 Symbols::start_index_param());
5562 signature.set_result_type(Type::Handle(zone, Type::ArrayType()));
5563
5564 // Cache the result.
5565 regexp.set_function(specialization_cid, sticky, fn);
5566
5567 fn.SetRegExpData(regexp, specialization_cid, sticky);
5568 fn.set_is_debuggable(false);
5569
5570 // The function is compiled lazily during the first call.
5571}
5572
5574 const String& pattern,
5576 Zone* zone = thread->zone();
5577 const RegExp& regexp = RegExp::Handle(RegExp::New(zone));
5578
5579 regexp.set_pattern(pattern);
5580 regexp.set_flags(flags);
5581
5582 // TODO(zerny): We might want to use normal string searching algorithms
5583 // for simple patterns.
5584 regexp.set_is_complex();
5585 regexp.set_is_global(); // All dart regexps are global.
5586
5587 if (!FLAG_interpret_irregexp) {
5588 const Library& lib = Library::Handle(zone, Library::CoreLibrary());
5589 const Class& owner =
5590 Class::Handle(zone, lib.LookupClass(Symbols::RegExp()));
5591
5592 for (intptr_t cid = kOneByteStringCid; cid <= kTwoByteStringCid; cid++) {
5593 CreateSpecializedFunction(thread, zone, regexp, cid, /*sticky=*/false,
5594 owner);
5595 CreateSpecializedFunction(thread, zone, regexp, cid, /*sticky=*/true,
5596 owner);
5597 }
5598 }
5599
5600 return regexp.ptr();
5601}
5602
5603} // namespace dart
static const int outset
Definition BlurTest.cpp:58
static void fail(const SkString &err)
Definition DM.cpp:234
static bool match(const char *needle, const char *haystack)
Definition DM.cpp:1132
static void info(const char *fmt,...) SK_PRINTF_LIKE(1
Definition DM.cpp:213
static constexpr uint64_t kBits
Definition DrawPass.cpp:55
static constexpr uint64_t kMask
Definition DrawPass.cpp:53
int count
static const int points[]
SkPoint pos
static void is_empty(skiatest::Reporter *reporter, const SkPath &p)
static float next(float f)
static const char marker[]
#define check(reporter, ref, unref, make, kill)
static bool read(SkStream *stream, void *buffer, size_t amount)
static bool skip(SkStream *stream, size_t amount)
static bool ok(int result)
static bool left(const SkPoint &p0, const SkPoint &p1)
static bool right(const SkPoint &p0, const SkPoint &p1)
static SkScalar center(float pos0, float pos1)
#define UNREACHABLE()
Definition assert.h:248
#define COMPILE_ASSERT(expr)
Definition assert.h:339
#define Z
intptr_t reg
Definition regexp.h:618
intptr_t clear_register_count
Definition regexp.h:631
intptr_t range_from
Definition regexp.h:640
static ActionNode * BeginSubmatch(intptr_t stack_pointer_reg, intptr_t position_reg, RegExpNode *on_success)
Definition regexp.cc:792
static ActionNode * EmptyMatchCheck(intptr_t start_register, intptr_t repetition_register, intptr_t repetition_limit, RegExpNode *on_success)
Definition regexp.cc:816
@ POSITIVE_SUBMATCH_SUCCESS
Definition regexp.h:569
intptr_t start_register
Definition regexp.h:635
struct dart::ActionNode::@201::@207 u_clear_captures
static ActionNode * SetRegister(intptr_t reg, intptr_t val, RegExpNode *on_success)
Definition regexp.cc:756
static ActionNode * PositiveSubmatchSuccess(intptr_t stack_pointer_reg, intptr_t restore_reg, intptr_t clear_capture_count, intptr_t clear_capture_from, RegExpNode *on_success)
Definition regexp.cc:802
virtual intptr_t EatsAtLeast(intptr_t still_to_find, intptr_t budget, bool not_at_start)
Definition regexp.cc:1473
intptr_t stack_pointer_register
Definition regexp.h:629
struct dart::ActionNode::@201::@206 u_empty_match_check
static ActionNode * IncrementRegister(intptr_t reg, RegExpNode *on_success)
Definition regexp.cc:766
static ActionNode * ClearCaptures(Interval range, RegExpNode *on_success)
Definition regexp.cc:784
struct dart::ActionNode::@201::@205 u_submatch
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition regexp.cc:3407
struct dart::ActionNode::@201::@203 u_increment_register
virtual void FillInBMInfo(intptr_t offset, intptr_t budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition regexp.cc:1481
struct dart::ActionNode::@201::@204 u_position_register
struct dart::ActionNode::@201::@202 u_store_register
static ActionNode * StorePosition(intptr_t reg, bool is_capture, RegExpNode *on_success)
Definition regexp.cc:774
AlternativeGenerationList(intptr_t count)
Definition regexp.cc:2737
AlternativeGeneration * at(intptr_t i)
Definition regexp.cc:2744
void EnsureAnalyzed(RegExpNode *node)
Definition regexp.cc:5092
const char * error_message()
Definition regexp.h:1417
virtual void VisitLoopChoice(LoopChoiceNode *that)
Definition regexp.cc:5146
bool has_failed()
Definition regexp.h:1416
static ArrayPtr New(intptr_t len, Heap::Space space=Heap::kNew)
Definition object.h:10933
static AssertionNode * AfterNewline(RegExpNode *on_success)
Definition regexp.h:742
virtual intptr_t EatsAtLeast(intptr_t still_to_find, intptr_t budget, bool not_at_start)
Definition regexp.cc:1493
static AssertionNode * AtNonBoundary(RegExpNode *on_success)
Definition regexp.h:739
static AssertionNode * AtStart(RegExpNode *on_success)
Definition regexp.h:733
static AssertionNode * AtEnd(RegExpNode *on_success)
Definition regexp.h:730
static AssertionNode * AtBoundary(RegExpNode *on_success)
Definition regexp.h:736
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition regexp.cc:2316
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, intptr_t filled_in, bool not_at_start)
Definition regexp.cc:2304
virtual void FillInBMInfo(intptr_t offset, intptr_t budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition regexp.cc:1506
virtual void FillInBMInfo(intptr_t offset, intptr_t budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition regexp.cc:5172
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition regexp.cc:3527
virtual intptr_t EatsAtLeast(intptr_t still_to_find, intptr_t recursion_depth, bool not_at_start)
Definition regexp.cc:1516
void TruncateTo(intptr_t length)
void AddArray(const BaseGrowableArray< T, B, Allocator > &src)
void Add(const T &value)
const T & At(intptr_t index) const
intptr_t length() const
Convenience wrapper around a BlockEntryInstr pointer.
bool is_linked() const
void SetAll(intptr_t map_number)
Definition regexp.h:1162
void SetInterval(intptr_t map_number, const Interval &interval)
Definition regexp.h:1152
void Set(intptr_t map_number, intptr_t character)
Definition regexp.h:1146
BoyerMooreLookahead(intptr_t length, RegExpCompiler *compiler, Zone *Zone)
Definition regexp.cc:2825
intptr_t Count(intptr_t map_number)
Definition regexp.h:1140
void SetRest(intptr_t from_map)
Definition regexp.h:1164
void EmitSkipInstructions(RegExpMacroAssembler *masm)
Definition regexp.cc:2948
void Set(intptr_t character)
Definition regexp.cc:2788
static constexpr intptr_t kMapSize
Definition regexp.h:1112
void SetInterval(const Interval &interval)
Definition regexp.cc:2792
virtual void SetCurrentPositionFromEnd(intptr_t by)
static bool IsCanonical(ZoneGrowableArray< CharacterRange > *ranges)
Definition regexp.cc:4772
static void AddClassEscape(uint16_t type, ZoneGrowableArray< CharacterRange > *ranges)
Definition regexp.cc:4651
static CharacterRange Range(int32_t from, int32_t to)
Definition regexp.h:40
static CharacterRange Everything()
Definition regexp.h:44
bool Contains(int32_t i) const
Definition regexp.h:53
static CharacterRange Singleton(int32_t value)
Definition regexp.h:37
int32_t to() const
Definition regexp.h:56
void set_from(int32_t value)
Definition regexp.h:55
int32_t from() const
Definition regexp.h:54
static void Canonicalize(ZoneGrowableArray< CharacterRange > *ranges)
Definition regexp.cc:4876
bool is_valid() const
Definition regexp.h:58
static void AddCaseEquivalents(ZoneGrowableArray< CharacterRange > *ranges, bool is_one_byte, Zone *zone)
Definition regexp.cc:4691
static void Negate(ZoneGrowableArray< CharacterRange > *src, ZoneGrowableArray< CharacterRange > *dst)
Definition regexp.cc:4912
static ZoneGrowableArray< CharacterRange > * List(Zone *zone, CharacterRange range)
Definition regexp.h:47
void set_standard_set_type(uint16_t special_set_type)
Definition regexp_ast.h:140
ZoneGrowableArray< CharacterRange > * ranges()
Definition regexp.cc:4785
virtual bool try_to_emit_quick_check_for_alternative(bool is_first)
Definition regexp.h:928
virtual intptr_t EatsAtLeast(intptr_t still_to_find, intptr_t budget, bool not_at_start)
Definition regexp.cc:1582
ZoneGrowableArray< GuardedAlternative > * alternatives_
Definition regexp.h:937
virtual RegExpNode * FilterOneByte(intptr_t depth)
Definition regexp.cc:2049
void set_not_at_start()
Definition regexp.h:926
virtual void FillInBMInfo(intptr_t offset, intptr_t budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition regexp.cc:5185
intptr_t EatsAtLeastHelper(intptr_t still_to_find, intptr_t budget, RegExpNode *ignore_this_node, bool not_at_start)
Definition regexp.cc:1557
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, intptr_t characters_filled_in, bool not_at_start)
Definition regexp.cc:2143
void AddAlternative(GuardedAlternative node)
Definition regexp.h:903
intptr_t GreedyLoopTextLengthForAlternative(const GuardedAlternative *alternative)
Definition regexp.cc:2632
ZoneGrowableArray< GuardedAlternative > * alternatives()
Definition regexp.h:904
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition regexp.cc:3127
bool not_at_start()
Definition regexp.h:925
static const int32_t kNoKey
Definition regexp.h:149
void set_to(int32_t value)
Definition regexp.h:133
void AddValue(int value, Zone *zone)
Definition regexp.h:134
OutSet * Get(int32_t value)
Definition regexp.cc:5079
void AddRange(CharacterRange range, int32_t value, Zone *zone)
Definition regexp.cc:4989
void ForEach(Callback *callback)
Definition regexp.h:166
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition regexp.cc:728
intptr_t Frequency(intptr_t in_character)
Definition regexp.cc:261
void CountCharacter(intptr_t character)
Definition regexp.cc:253
void set_result_type(const AbstractType &value) const
Definition object.cc:8633
void SetParameterTypeAt(intptr_t index, const AbstractType &value) const
Definition object.cc:8648
void set_num_fixed_parameters(intptr_t value) const
Definition object.cc:11659
void set_parameter_types(const Array &value) const
Definition object.cc:8655
static FunctionTypePtr New(intptr_t num_parent_type_arguments=0, Nullability nullability=Nullability::kLegacy, Heap::Space space=Heap::kOld)
Definition object.cc:11682
void CreateNameArray(Heap::Space space=Heap::kOld) const
Definition object.cc:8735
static FunctionPtr New(const FunctionType &signature, const String &name, UntaggedFunction::Kind kind, bool is_static, bool is_const, bool is_abstract, bool is_external, bool is_native, const Object &owner, TokenPosition token_pos, Heap::Space space=Heap::kOld)
Definition object.cc:10301
intptr_t string_specialization_cid() const
Definition object.cc:8538
void SetParameterNameAt(intptr_t index, const String &value) const
Definition object.cc:8681
void SetRegExpData(const RegExp &regexp, intptr_t string_specialization_cid, bool sticky) const
Definition object.cc:8550
GreedyLoopState(bool not_at_start)
Definition regexp.cc:3091
Trace * counter_backtrack_trace()
Definition regexp.h:1366
BlockLabel * label()
Definition regexp.h:1365
intptr_t reg()
Definition regexp.h:866
intptr_t value()
Definition regexp.h:868
Relation op()
Definition regexp.h:867
RegExpNode * node() const
Definition regexp.h:881
ZoneGrowableArray< Guard * > * guards() const
Definition regexp.h:883
void AddGuard(Guard *guard, Zone *zone)
Definition regexp.cc:751
@ kOld
Definition heap.h:39
virtual void SetCurrentPositionFromEnd(intptr_t by)
bool Contains(intptr_t value) const
Definition regexp.h:524
intptr_t to() const
Definition regexp.h:529
intptr_t from() const
Definition regexp.h:528
bool is_empty() const
Definition regexp.h:527
static uint16_t TryConvertToLatin1(uint16_t c)
Definition unicode.h:238
static LibraryPtr CoreLibrary()
Definition object.cc:14834
ClassPtr LookupClass(const String &name) const
Definition object.cc:14152
virtual intptr_t EatsAtLeast(intptr_t still_to_find, intptr_t budget, bool not_at_start)
Definition regexp.cc:1576
virtual void Accept(NodeVisitor *visitor)
Definition regexp.cc:835
void AddContinueAlternative(GuardedAlternative alt)
Definition regexp.cc:2660
RegExpNode * loop_node()
Definition regexp.h:1034
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition regexp.cc:2666
virtual void FillInBMInfo(intptr_t offset, intptr_t budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition regexp.cc:2130
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, intptr_t characters_filled_in, bool not_at_start)
Definition regexp.cc:2120
void AddLoopAlternative(GuardedAlternative alt)
Definition regexp.cc:2654
virtual RegExpNode * FilterOneByte(intptr_t depth)
Definition regexp.cc:2033
virtual intptr_t EatsAtLeast(intptr_t still_to_find, intptr_t budget, bool not_at_start)
Definition regexp.cc:1536
virtual RegExpNode * FilterOneByte(intptr_t depth)
Definition regexp.cc:2099
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, intptr_t characters_filled_in, bool not_at_start)
Definition regexp.cc:1546
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition regexp.cc:702
virtual void VisitLoopChoice(LoopChoiceNode *that)
Definition regexp.h:1390
static void static void PrintErr(const char *format,...) PRINTF_ATTRIBUTE(1
ObjectPtr ptr() const
Definition object.h:332
bool IsNull() const
Definition object.h:363
static Object & Handle()
Definition object.h:407
static Object & ZoneHandle()
Definition object.h:419
OutSet * Extend(unsigned value, Zone *zone)
Definition regexp.cc:4947
bool Get(unsigned value) const
Definition regexp.cc:4977
const Function & function() const
Definition parser.h:73
intptr_t characters()
Definition regexp.h:359
Position * positions(intptr_t index)
Definition regexp.h:361
bool Rationalize(bool one_byte)
Definition regexp.cc:1598
void Advance(intptr_t by, bool one_byte)
Definition regexp.cc:1871
void Merge(QuickCheckDetails *other, intptr_t from_index)
Definition regexp.cc:1892
RecursionCheck(RegExpCompiler *compiler)
Definition regexp.cc:375
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition regexp.cc:4584
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition regexp.cc:4433
AssertionType assertion_type() const
Definition regexp_ast.h:121
intptr_t length() const
Definition regexp_ast.h:232
RegExpFlags flags() const
Definition regexp_ast.h:233
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition regexp.cc:3795
virtual void AppendToText(RegExpText *text)
Definition regexp.cc:212
ZoneGrowableArray< uint16_t > * data() const
Definition regexp_ast.h:231
bool ignore_case() const
Definition regexp_ast.h:234
intptr_t index() const
Definition regexp_ast.h:419
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition regexp.cc:4489
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition regexp.cc:4562
static intptr_t EndRegister(intptr_t index)
Definition regexp_ast.h:338
static intptr_t StartRegister(intptr_t index)
Definition regexp_ast.h:337
uint16_t standard_type() const
Definition regexp_ast.h:205
virtual void AppendToText(RegExpText *text)
Definition regexp.cc:216
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition regexp.cc:4166
bool contains_split_surrogate() const
Definition regexp_ast.h:209
ZoneGrowableArray< CharacterRange > * ranges()
Definition regexp_ast.h:206
intptr_t AllocateRegister()
Definition regexp.cc:296
static constexpr intptr_t kImplementationOffset
Definition regexp.cc:329
void AddWork(RegExpNode *node)
Definition regexp.cc:327
void set_read_backward(bool value)
Definition regexp.cc:345
static constexpr intptr_t kMaxRecursion
Definition regexp.cc:336
RegExpCompiler(intptr_t capture_count, bool is_one_byte)
Definition regexp.cc:390
void set_current_expansion_factor(intptr_t value)
Definition regexp.cc:349
Zone * zone() const
Definition regexp.cc:353
RegExpEngine::CompilationResult Assemble(IRRegExpMacroAssembler *assembler, RegExpNode *start, intptr_t capture_count, const String &pattern)
Definition regexp.cc:405
void DecrementRecursionDepth()
Definition regexp.cc:339
FrequencyCollator * frequency_collator()
Definition regexp.cc:346
intptr_t UnicodeLookaroundPositionRegister()
Definition regexp.cc:307
static constexpr intptr_t kCodeOffset
Definition regexp.cc:331
bool one_byte() const
Definition regexp.cc:343
EndNode * accept()
Definition regexp.cc:334
static constexpr intptr_t kNoRegister
Definition regexp.cc:355
intptr_t UnicodeLookaroundStackRegister()
Definition regexp.cc:300
RegExpMacroAssembler * macro_assembler()
Definition regexp.cc:333
intptr_t recursion_depth()
Definition regexp.cc:337
void IncrementRecursionDepth()
Definition regexp.cc:338
intptr_t current_expansion_factor()
Definition regexp.cc:348
static constexpr intptr_t kNumberOfRegistersOffset
Definition regexp.cc:330
ZoneGrowableArray< RegExpTree * > * alternatives() const
Definition regexp_ast.h:73
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition regexp.cc:4203
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition regexp.cc:4496
static void DotPrint(const char *label, RegExpNode *node, bool ignore_case)
static CompilationResult CompileBytecode(RegExpCompileData *data, const RegExp &regexp, bool is_one_byte, bool sticky, Zone *zone)
Definition regexp.cc:5414
static CompilationResult CompileIR(RegExpCompileData *input, const ParsedFunction *parsed_function, const ZoneGrowableArray< const ICData * > &ic_data_array, intptr_t osr_id)
Definition regexp.cc:5298
static RegExpPtr CreateRegExp(Thread *thread, const String &pattern, RegExpFlags flags)
Definition regexp.cc:5573
RegExpExpansionLimiter(RegExpCompiler *compiler, intptr_t factor)
Definition regexp.cc:4226
static constexpr intptr_t kMaxExpansionFactor
Definition regexp.cc:4225
bool NeedsUnicodeCaseEquivalents()
Definition object.h:12702
bool IsGlobal() const
Definition object.h:12696
bool IsUnicode() const
Definition object.h:12699
bool IgnoreCase() const
Definition object.h:12697
Builder(bool is_positive, RegExpNode *on_success, intptr_t stack_pointer_register, intptr_t position_register, intptr_t capture_register_count=0, intptr_t capture_register_start=0)
Definition regexp.cc:4501
RegExpNode * ForMatch(RegExpNode *match)
Definition regexp.cc:4522
RegExpTree * body() const
Definition regexp_ast.h:368
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition regexp.cc:4540
bool is_positive() const
Definition regexp_ast.h:369
void set_global_mode(GlobalMode mode)
virtual void CheckCharacterLT(uint16_t limit, BlockLabel *on_less)=0
virtual void CheckCharacterInRange(uint16_t from, uint16_t to, BlockLabel *on_in_range)=0
virtual void CheckNotBackReferenceIgnoreCase(intptr_t start_reg, bool read_backward, bool unicode, BlockLabel *on_no_match)=0
virtual void CheckNotCharacter(unsigned c, BlockLabel *on_not_equal)=0
virtual void ReadStackPointerFromRegister(intptr_t reg)=0
static constexpr intptr_t kTableSizeBits
static constexpr intptr_t kTableMask
virtual void PopCurrentPosition()=0
virtual void CheckNotAtStart(intptr_t cp_offset, BlockLabel *on_not_at_start)=0
virtual void CheckPosition(intptr_t cp_offset, BlockLabel *on_outside_input)
virtual void CheckAtStart(BlockLabel *on_at_start)=0
static constexpr intptr_t kTableSize
virtual void CheckCharacterAfterAnd(unsigned c, unsigned and_with, BlockLabel *on_equal)=0
virtual void IfRegisterGE(intptr_t reg, intptr_t comparand, BlockLabel *if_ge)=0
virtual void ReadCurrentPositionFromRegister(intptr_t reg)=0
virtual void WriteCurrentPositionToRegister(intptr_t reg, intptr_t cp_offset)=0
virtual void CheckCharacterNotInRange(uint16_t from, uint16_t to, BlockLabel *on_not_in_range)=0
virtual void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with, BlockLabel *on_not_equal)=0
virtual void IfRegisterEqPos(intptr_t reg, BlockLabel *if_eq)=0
virtual void CheckCharacterGT(uint16_t limit, BlockLabel *on_greater)=0
virtual void CheckNotBackReference(intptr_t start_reg, bool read_backward, BlockLabel *on_no_match)=0
virtual bool Succeed()=0
virtual void PushCurrentPosition()=0
virtual void AdvanceCurrentPosition(intptr_t by)=0
virtual void BindBlock(BlockLabel *label)=0
virtual void CheckCharacter(unsigned c, BlockLabel *on_equal)=0
virtual void IfRegisterLT(intptr_t reg, intptr_t comparand, BlockLabel *if_lt)=0
virtual void CheckGreedyLoop(BlockLabel *on_tos_equals_current_position)=0
virtual void GoTo(BlockLabel *to)=0
virtual void CheckBitInTable(const TypedData &table, BlockLabel *on_bit_set)=0
void CheckNotInSurrogatePair(intptr_t cp_offset, BlockLabel *on_failure)
virtual bool CheckSpecialCharacterClass(uint16_t type, BlockLabel *on_no_match)
virtual void Backtrack()=0
virtual void CheckNotCharacterAfterMinusAnd(uint16_t c, uint16_t minus, uint16_t and_with, BlockLabel *on_not_equal)=0
virtual void LoadCurrentCharacter(intptr_t cp_offset, BlockLabel *on_end_of_input, bool check_bounds=true, intptr_t characters=1)=0
virtual void PushBacktrack(BlockLabel *label)=0
virtual void WriteStackPointerToRegister(intptr_t reg)=0
virtual void ClearRegisters(intptr_t reg_from, intptr_t reg_to)=0
static constexpr intptr_t kMaxCPOffset
virtual void CheckPreemption(bool is_backtrack)
virtual void Accept(NodeVisitor *visitor)=0
virtual void Emit(RegExpCompiler *compiler, Trace *trace)=0
NodeInfo * info()
Definition regexp.h:477
static constexpr intptr_t kNodeIsTooComplexForGreedyLoops
Definition regexp.h:422
virtual ~RegExpNode()
Definition regexp.cc:1429
virtual RegExpNode * FilterOneByte(intptr_t depth)
Definition regexp.h:449
LimitResult LimitVersions(RegExpCompiler *compiler, Trace *trace)
Definition regexp.cc:1431
bool EmitQuickCheck(RegExpCompiler *compiler, Trace *bounds_check_trace, Trace *trace, bool preload_has_checked_bounds, BlockLabel *on_possible_success, QuickCheckDetails *details_return, bool fall_through_on_failure)
Definition regexp.cc:1621
Zone * zone() const
Definition regexp.h:483
BoyerMooreLookahead * bm_info(bool not_at_start)
Definition regexp.h:479
virtual intptr_t GreedyLoopTextLength()
Definition regexp.h:423
virtual void FillInBMInfo(intptr_t offset, intptr_t budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition regexp.h:439
RegExpNode(Zone *zone)
Definition regexp.h:386
static constexpr intptr_t kRecursionBudget
Definition regexp.h:438
virtual intptr_t EatsAtLeast(intptr_t still_to_find, intptr_t budget, bool not_at_start)=0
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, intptr_t characters_filled_in, bool not_at_start)=0
intptr_t max() const
Definition regexp_ast.h:296
intptr_t min() const
Definition regexp_ast.h:295
RegExpTree * body() const
Definition regexp_ast.h:300
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition regexp.cc:4216
bool is_greedy() const
Definition regexp_ast.h:299
virtual void AppendToText(RegExpText *text)
Definition regexp.cc:220
GrowableArray< TextElement > * elements()
Definition regexp_ast.h:256
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition regexp.cc:3803
virtual intptr_t min_match() const =0
virtual Interval CaptureRegisters() const
Definition regexp_ast.h:51
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)=0
static constexpr intptr_t kInfinity
Definition regexp_ast.h:39
virtual void AppendToText(RegExpText *text)
Definition regexp.cc:208
void set_pattern(const String &pattern) const
Definition object.cc:26691
StringPtr pattern() const
Definition object.h:12771
void set_function(intptr_t cid, bool sticky, const Function &value) const
Definition object.cc:26695
static RegExpPtr New(Zone *zone, Heap::Space space=Heap::kNew)
Definition object.cc:26741
void set_is_complex() const
Definition object.h:12857
void set_flags(RegExpFlags flags) const
Definition object.h:12868
RegExpFlags flags() const
Definition object.h:12865
void set_is_global() const
Definition object.h:12841
RegExpNode * on_success()
Definition regexp.h:544
virtual RegExpNode * FilterOneByte(intptr_t depth)
Definition regexp.cc:1931
RegExpNode * FilterSuccessor(intptr_t depth)
Definition regexp.cc:1939
bool Insert(const Key &key, Locator *locator)
@ kMaxOneCharCodeSymbol
Definition symbols.h:576
static const String & This()
Definition symbols.h:691
static TextElement CharClass(RegExpCharacterClass *char_class)
Definition regexp.cc:229
TextType text_type() const
Definition regexp.h:246
RegExpCharacterClass * char_class() const
Definition regexp.h:255
intptr_t length() const
Definition regexp.cc:233
static TextElement Atom(RegExpAtom *atom)
Definition regexp.cc:225
intptr_t cp_offset() const
Definition regexp.h:242
RegExpAtom * atom() const
Definition regexp.h:250
void set_cp_offset(intptr_t cp_offset)
Definition regexp.h:243
virtual RegExpNode * FilterOneByte(intptr_t depth)
Definition regexp.cc:1977
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition regexp.cc:2514
virtual RegExpNode * GetSuccessorOfOmnivorousTextNode(RegExpCompiler *compiler)
Definition regexp.cc:2604
virtual intptr_t GreedyLoopTextLength()
Definition regexp.cc:2599
static TextNode * CreateForCharacterRanges(ZoneGrowableArray< CharacterRange > *ranges, bool read_backward, RegExpNode *on_success, RegExpFlags flags)
Definition regexp.cc:2480
virtual intptr_t EatsAtLeast(intptr_t still_to_find, intptr_t budget, bool not_at_start)
Definition regexp.cc:1524
virtual void FillInBMInfo(intptr_t offset, intptr_t budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition regexp.cc:5203
void MakeCaseIndependent(bool is_one_byte)
Definition regexp.cc:2581
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, intptr_t characters_filled_in, bool not_at_start)
Definition regexp.cc:1700
static TextNode * CreateForSurrogatePair(CharacterRange lead, CharacterRange trail, bool read_backward, RegExpNode *on_success, RegExpFlags flags)
Definition regexp.cc:2491
void CalculateOffsets()
Definition regexp.cc:5104
Zone * zone() const
static Thread * Current()
Definition thread.h:361
static const TokenPosition kMinSource
DeferredAction * next()
Definition regexp.h:1212
bool Mentions(intptr_t reg)
Definition regexp.cc:461
ActionNode::ActionType action_type()
Definition regexp.h:1215
intptr_t characters_preloaded()
Definition regexp.h:1305
QuickCheckDetails * quick_check_performed()
Definition regexp.h:1308
bool GetStoredPosition(intptr_t reg, intptr_t *cp_offset)
Definition regexp.cc:478
void set_loop_label(BlockLabel *label)
Definition regexp.h:1323
void InvalidateCurrentCharacter()
Definition regexp.cc:2558
void set_at_start(TriBool at_start)
Definition regexp.h:1301
void set_bound_checked_up_to(intptr_t to)
Definition regexp.h:1327
BlockLabel * loop_label()
Definition regexp.h:1303
void Flush(RegExpCompiler *compiler, RegExpNode *successor)
Definition regexp.cc:649
void set_characters_preloaded(intptr_t count)
Definition regexp.h:1324
bool mentions_reg(intptr_t reg)
Definition regexp.cc:470
RegExpNode * stop_node()
Definition regexp.h:1304
void set_stop_node(RegExpNode *node)
Definition regexp.h:1322
intptr_t cp_offset()
Definition regexp.h:1283
bool is_trivial()
Definition regexp.h:1295
BlockLabel * backtrack()
Definition regexp.h:1302
intptr_t bound_checked_up_to()
Definition regexp.h:1306
TriBool at_start()
Definition regexp.h:1300
void set_quick_check_performed(QuickCheckDetails *d)
Definition regexp.h:1329
DeferredAction * actions()
Definition regexp.h:1284
intptr_t flush_budget()
Definition regexp.h:1307
void add_action(DeferredAction *new_action)
Definition regexp.h:1316
void set_flush_budget(intptr_t to)
Definition regexp.h:1328
void AdvanceCurrentPositionInTrace(intptr_t by, RegExpCompiler *compiler)
Definition regexp.cc:2562
void set_backtrack(BlockLabel *backtrack)
Definition regexp.h:1321
static TypePtr ArrayType()
Definition object.cc:21934
static TypedDataPtr New(intptr_t class_id, intptr_t len, Heap::Space space=Heap::kNew)
Definition object.cc:25666
ZoneGrowableArray< CharacterRange > * lead_surrogates()
Definition regexp.h:186
UnicodeRangeSplitter(Zone *zone, ZoneGrowableArray< CharacterRange > *base)
Definition regexp.cc:3899
ZoneGrowableArray< CharacterRange > * trail_surrogates()
Definition regexp.h:189
ZoneGrowableArray< CharacterRange > * non_bmp() const
Definition regexp.h:192
ZoneGrowableArray< CharacterRange > * bmp()
Definition regexp.h:185
void Call(uint32_t from, ChoiceTable::Entry entry)
Definition regexp.cc:3939
static constexpr int32_t kLeadSurrogateStart
Definition unicode.h:159
static constexpr int32_t kMaxCodeUnit
Definition unicode.h:158
static constexpr int32_t kTrailSurrogateStart
Definition unicode.h:161
static void Encode(int32_t codepoint, uint16_t *dst)
Definition unicode.cc:273
static constexpr int32_t kTrailSurrogateEnd
Definition unicode.h:162
static constexpr int32_t kLeadSurrogateEnd
Definition unicode.h:160
static constexpr int32_t kMaxCodePoint
Definition unicode.h:18
static constexpr int32_t kInvalidChar
Definition unicode.h:19
static constexpr T Maximum(T x, T y)
Definition utils.h:26
static T Minimum(T x, T y)
Definition utils.h:21
VisitMarker(NodeInfo *info)
Definition regexp.cc:1921
intptr_t get(int32_t c, int32_t n, int32_t *result)
Definition unibrow-inl.h:15
static constexpr int kSize
#define DECLARE_VISIT(type, attrs)
#define UNIMPLEMENTED
#define ASSERT(E)
EMSCRIPTEN_KEEPALIVE void empty()
AtkStateType state
FlutterSemanticsFlag flags
glong glong end
static const uint8_t buffer[]
uint8_t value
GAsyncResult * result
uint32_t * target
Dart_NativeFunction function
Definition fuchsia.cc:51
const char * name
Definition fuchsia.cc:50
static float max(float r, float g, float b)
Definition hsl.cpp:49
static float min(float r, float g, float b)
Definition hsl.cpp:48
#define DEFINE_ACCEPT(ShortName, Attrs)
Definition il.cc:1261
size_t length
std::u16string text
static constexpr intptr_t kWordRangeCount
Definition regexp.cc:2778
static void SplitSearchSpace(ZoneGrowableArray< uint16_t > *ranges, intptr_t start_index, intptr_t end_index, intptr_t *new_start_index, intptr_t *new_end_index, uint16_t *border)
Definition regexp.cc:1148
static constexpr intptr_t kMaxLookaheadForBoyerMoore
Definition regexp.cc:34
static constexpr int32_t kLineTerminatorRanges[]
Definition regexp.cc:2783
static intptr_t InsertRangeInCanonicalList(ZoneGrowableArray< CharacterRange > *list, intptr_t count, CharacterRange insert)
Definition regexp.cc:4811
static bool CompareInverseRanges(ZoneGrowableArray< CharacterRange > *ranges, const int32_t *special_class, intptr_t length)
Definition regexp.cc:3813
static void AddClassNegated(const int32_t *elmv, intptr_t elmc, ZoneGrowableArray< CharacterRange > *ranges)
Definition regexp.cc:4611
constexpr int32_t kMinInt32
Definition globals.h:482
void PrintUtf16(uint16_t c)
void AddBmpCharacters(RegExpCompiler *compiler, ChoiceNode *result, RegExpNode *on_success, UnicodeRangeSplitter *splitter)
Definition regexp.cc:3959
static void EmitCharClass(RegExpMacroAssembler *macro_assembler, RegExpCharacterClass *cc, bool one_byte, BlockLabel *on_failure, intptr_t cp_offset, bool check_offset, bool preloaded, Zone *zone)
Definition regexp.cc:1331
static constexpr intptr_t kSurrogateRangeCount
Definition regexp.cc:2782
static void EmitHat(RegExpCompiler *compiler, RegExpNode *on_success, Trace *trace)
Definition regexp.cc:2188
static constexpr int32_t kSpaceRanges[]
Definition regexp.cc:2771
void AddLoneLeadSurrogates(RegExpCompiler *compiler, ChoiceNode *result, RegExpNode *on_success, UnicodeRangeSplitter *splitter)
Definition regexp.cc:4065
static constexpr bool kRegexpOptimization
Definition regexp.cc:31
static uint16_t ConvertNonLatin1ToLatin1(uint16_t c)
Definition regexp.cc:1962
static constexpr int32_t kRangeEndMarker
Definition regexp.cc:2764
static void UpdateBoundsCheck(intptr_t index, intptr_t *checked_up_to)
Definition regexp.cc:2357
static bool ShortCutEmitCharacterPair(RegExpMacroAssembler *macro_assembler, bool one_byte, uint16_t c1, uint16_t c2, BlockLabel *on_failure)
Definition regexp.cc:935
ContainedInLattice AddRange(ContainedInLattice containment, const int32_t *ranges, intptr_t ranges_length, Interval new_range)
Definition regexp.cc:36
intptr_t word
Definition globals.h:500
static bool RangeContainsLatin1Equivalents(CharacterRange range)
Definition regexp.cc:1947
static void EmitDoubleBoundaryTest(RegExpMacroAssembler *masm, uint16_t first, uint16_t last, BlockLabel *fall_through, BlockLabel *in_range, BlockLabel *out_of_range)
Definition regexp.cc:1040
@ kNoRegister
void CreateSpecializedFunction(Thread *thread, Zone *zone, const RegExp &regexp, intptr_t specialization_cid, bool sticky, const Object &owner)
Definition regexp.cc:5523
void AddNonBmpSurrogatePairs(RegExpCompiler *compiler, ChoiceNode *result, RegExpNode *on_success, UnicodeRangeSplitter *splitter)
Definition regexp.cc:3969
RegExpNode * MatchAndNegativeLookaroundInReadDirection(RegExpCompiler *compiler, ZoneGrowableArray< CharacterRange > *match, ZoneGrowableArray< CharacterRange > *lookahead, RegExpNode *on_success, bool read_backward, RegExpFlags flags)
Definition regexp.cc:4048
static constexpr int32_t kWordRanges[]
Definition regexp.cc:2776
static constexpr int32_t kSurrogateRanges[]
Definition regexp.cc:2781
static constexpr intptr_t kSpaceRangeCount
Definition regexp.cc:2775
static bool EmitAtomLetter(Zone *zone, RegExpCompiler *compiler, uint16_t c, BlockLabel *on_failure, intptr_t cp_offset, bool check, bool preloaded)
Definition regexp.cc:981
static void EmitWordCheck(RegExpMacroAssembler *assembler, BlockLabel *word, BlockLabel *non_word, bool fall_through_on_word)
Definition regexp.cc:2163
static void USE(T &&)
Definition globals.h:618
RegExpNode * OptionallyStepBackToLeadSurrogate(RegExpCompiler *compiler, RegExpNode *on_success, RegExpFlags flags)
Definition regexp.cc:5263
bool EmitCharacterFunction(Zone *zone, RegExpCompiler *compiler, uint16_t c, BlockLabel *on_failure, intptr_t cp_offset, bool check, bool preloaded)
Definition regexp.cc:971
void AddUnicodeCaseEquivalents(ZoneGrowableArray< CharacterRange > *ranges)
Definition regexp.cc:4135
ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b)
Definition regexp.h:1087
const intptr_t cid
void AddLoneTrailSurrogates(RegExpCompiler *compiler, ChoiceNode *result, RegExpNode *on_success, UnicodeRangeSplitter *splitter)
Definition regexp.cc:4093
static bool ArrayContains(ZoneGrowableArray< unsigned > *array, unsigned value)
Definition regexp.cc:4938
static void CutOutRange(RegExpMacroAssembler *masm, ZoneGrowableArray< uint16_t > *ranges, intptr_t start_index, intptr_t end_index, intptr_t cut_index, BlockLabel *even_label, BlockLabel *odd_label)
Definition regexp.cc:1121
RegExpNode * NegativeLookaroundAgainstReadDirectionAndMatch(RegExpCompiler *compiler, ZoneGrowableArray< CharacterRange > *lookbehind, ZoneGrowableArray< CharacterRange > *match, RegExpNode *on_success, bool read_backward, RegExpFlags flags)
Definition regexp.cc:4030
static constexpr intptr_t kDigitRangeCount
Definition regexp.cc:2780
static uint32_t SmearBitsRight(uint32_t v)
Definition regexp.cc:1589
static void AddClass(const int32_t *elmv, intptr_t elmc, ZoneGrowableArray< CharacterRange > *ranges)
Definition regexp.cc:4600
static intptr_t GetCaseIndependentLetters(uint16_t character, bool one_byte_subject, int32_t *letters)
Definition regexp.cc:861
static void GenerateBranches(RegExpMacroAssembler *masm, ZoneGrowableArray< uint16_t > *ranges, intptr_t start_index, intptr_t end_index, uint16_t min_char, uint16_t max_char, BlockLabel *fall_through, BlockLabel *even_label, BlockLabel *odd_label)
Definition regexp.cc:1217
static bool EmitAtomNonLetter(Zone *zone, RegExpCompiler *compiler, uint16_t c, BlockLabel *on_failure, intptr_t cp_offset, bool check, bool preloaded)
Definition regexp.cc:902
static int8_t data[kExtLength]
ContainedInLattice
Definition regexp.h:1080
@ kLatticeOut
Definition regexp.h:1083
@ kLatticeUnknown
Definition regexp.h:1084
@ kLatticeIn
Definition regexp.h:1082
static constexpr intptr_t kLineTerminatorRangeCount
Definition regexp.cc:2785
RegExpNode * UnanchoredAdvance(RegExpCompiler *compiler, RegExpNode *on_success)
Definition regexp.cc:4121
static constexpr int32_t kDigitRanges[]
Definition regexp.cc:2779
static void MoveRanges(ZoneGrowableArray< CharacterRange > *list, intptr_t from, intptr_t to, intptr_t count)
Definition regexp.cc:4795
static void EmitUseLookupTable(RegExpMacroAssembler *masm, ZoneGrowableArray< uint16_t > *ranges, intptr_t start_index, intptr_t end_index, uint16_t min_char, BlockLabel *fall_through, BlockLabel *even_label, BlockLabel *odd_label)
Definition regexp.cc:1064
static void EmitBoundaryTest(RegExpMacroAssembler *masm, uint16_t border, BlockLabel *fall_through, BlockLabel *above_or_equal, BlockLabel *below)
Definition regexp.cc:1027
static RegExpEngine::CompilationResult IrregexpRegExpTooBig()
Definition regexp.cc:384
static bool RangesContainLatin1Equivalents(ZoneGrowableArray< CharacterRange > *ranges)
Definition regexp.cc:1953
static bool EmitSimpleCharacter(Zone *zone, RegExpCompiler *compiler, uint16_t c, BlockLabel *on_failure, intptr_t cp_offset, bool check, bool preloaded)
Definition regexp.cc:883
static bool DeterminedAlready(QuickCheckDetails *quick_check, intptr_t offset)
Definition regexp.cc:2351
static bool CompareRanges(ZoneGrowableArray< CharacterRange > *ranges, const int32_t *special_class, intptr_t length)
Definition regexp.cc:3843
SI auto map(std::index_sequence< I... >, Fn &&fn, const Args &... args) -> skvx::Vec< sizeof...(I), decltype(fn(args[0]...))>
Definition SkVx.h:680
#define FALL_THROUGH
Definition globals.h:15
#define Pd
Definition globals.h:408
#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName)
Definition globals.h:593
#define DISALLOW_ALLOCATION()
Definition globals.h:604
#define DISALLOW_COPY_AND_ASSIGN(TypeName)
Definition globals.h:581
#define FOR_EACH_NODE_TYPE(VISIT)
Definition regexp.h:210
Point offset
QuickCheckDetails quick_check_details
Definition regexp.cc:2730
bool been_analyzed
Definition regexp.h:324
bool being_analyzed
Definition regexp.h:323
static constexpr intptr_t kEatsAtLeastNotYetInitialized
Definition regexp.h:1374
intptr_t preload_characters_
Definition regexp.h:1377
intptr_t eats_at_least_
Definition regexp.h:1378
static constexpr intptr_t kMaxWidth
Definition unibrow.h:56
#define ARRAY_SIZE(array)
Definition globals.h:72
#define OZ
Definition regexp.cc:3793
WORD ATOM