Flutter Engine
The Flutter Engine
regexp.cc
Go to the documentation of this file.
1// Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
2// for details. All rights reserved. Use of this source code is governed by a
3// BSD-style license that can be found in the LICENSE file.
4
5#include "vm/regexp.h"
6
7#include <memory>
8
10#include "platform/unicode.h"
11
12#include "unicode/uniset.h"
13
14#include "vm/dart_entry.h"
15#include "vm/regexp_assembler.h"
17#include "vm/regexp_ast.h"
18#include "vm/symbols.h"
19#include "vm/thread.h"
20#include "vm/unibrow-inl.h"
21
22#if !defined(DART_PRECOMPILED_RUNTIME)
24#endif // !defined(DART_PRECOMPILED_RUNTIME)
25
26#define Z (zone())
27
28namespace dart {
29
30// Default to generating optimized regexp code.
31static constexpr bool kRegexpOptimization = true;
32
33// More makes code generation slower, less makes V8 benchmark score lower.
34static constexpr intptr_t kMaxLookaheadForBoyerMoore = 8;
35
37 const int32_t* ranges,
38 intptr_t ranges_length,
39 Interval new_range) {
40 ASSERT((ranges_length & 1) == 1);
41 ASSERT(ranges[ranges_length - 1] == Utf::kMaxCodePoint + 1);
42 if (containment == kLatticeUnknown) return containment;
43 bool inside = false;
44 int32_t last = 0;
45 for (intptr_t i = 0; i < ranges_length;
46 inside = !inside, last = ranges[i], i++) {
47 // Consider the range from last to ranges[i].
48 // We haven't got to the new range yet.
49 if (ranges[i] <= new_range.from()) continue;
50 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
51 // inclusive, but the values in ranges are not.
52 if (last <= new_range.from() && new_range.to() < ranges[i]) {
53 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
54 }
55 return kLatticeUnknown;
56 }
57 return containment;
58}
59
60// -------------------------------------------------------------------
61// Implementation of the Irregexp regular expression engine.
62//
63// The Irregexp regular expression engine is intended to be a complete
64// implementation of ECMAScript regular expressions. It generates
65// IR code that is subsequently compiled to native code.
66
67// The Irregexp regexp engine is structured in three steps.
68// 1) The parser generates an abstract syntax tree. See regexp_ast.cc.
69// 2) From the AST a node network is created. The nodes are all
70// subclasses of RegExpNode. The nodes represent states when
71// executing a regular expression. Several optimizations are
72// performed on the node network.
73// 3) From the nodes we generate IR instructions that can actually
74// execute the regular expression (perform the search). The
75// code generation step is described in more detail below.
76
77// Code generation.
78//
79// The nodes are divided into four main categories.
80// * Choice nodes
81// These represent places where the regular expression can
82// match in more than one way. For example on entry to an
83// alternation (foo|bar) or a repetition (*, +, ? or {}).
84// * Action nodes
85// These represent places where some action should be
86// performed. Examples include recording the current position
87// in the input string to a register (in order to implement
88// captures) or other actions on register for example in order
89// to implement the counters needed for {} repetitions.
90// * Matching nodes
91// These attempt to match some element part of the input string.
92// Examples of elements include character classes, plain strings
93// or back references.
94// * End nodes
95// These are used to implement the actions required on finding
96// a successful match or failing to find a match.
97//
98// The code generated maintains some state as it runs. This consists of the
99// following elements:
100//
101// * The capture registers. Used for string captures.
102// * Other registers. Used for counters etc.
103// * The current position.
104// * The stack of backtracking information. Used when a matching node
105// fails to find a match and needs to try an alternative.
106//
107// Conceptual regular expression execution model:
108//
109// There is a simple conceptual model of regular expression execution
110// which will be presented first. The actual code generated is a more
111// efficient simulation of the simple conceptual model:
112//
113// * Choice nodes are implemented as follows:
114// For each choice except the last {
115// push current position
116// push backtrack code location
117// <generate code to test for choice>
118// backtrack code location:
119// pop current position
120// }
121// <generate code to test for last choice>
122//
123// * Actions nodes are generated as follows
124// <push affected registers on backtrack stack>
125// <generate code to perform action>
126// push backtrack code location
127// <generate code to test for following nodes>
128// backtrack code location:
129// <pop affected registers to restore their state>
130// <pop backtrack location from stack and go to it>
131//
132// * Matching nodes are generated as follows:
133// if input string matches at current position
134// update current position
135// <generate code to test for following nodes>
136// else
137// <pop backtrack location from stack and go to it>
138//
139// Thus it can be seen that the current position is saved and restored
140// by the choice nodes, whereas the registers are saved and restored by
141// by the action nodes that manipulate them.
142//
143// The other interesting aspect of this model is that nodes are generated
144// at the point where they are needed by a recursive call to Emit(). If
145// the node has already been code generated then the Emit() call will
146// generate a jump to the previously generated code instead. In order to
147// limit recursion it is possible for the Emit() function to put the node
148// on a work list for later generation and instead generate a jump. The
149// destination of the jump is resolved later when the code is generated.
150//
151// Actual regular expression code generation.
152//
153// Code generation is actually more complicated than the above. In order
154// to improve the efficiency of the generated code some optimizations are
155// performed
156//
157// * Choice nodes have 1-character lookahead.
158// A choice node looks at the following character and eliminates some of
159// the choices immediately based on that character. This is not yet
160// implemented.
161// * Simple greedy loops store reduced backtracking information.
162// A quantifier like /.*foo/m will greedily match the whole input. It will
163// then need to backtrack to a point where it can match "foo". The naive
164// implementation of this would push each character position onto the
165// backtracking stack, then pop them off one by one. This would use space
166// proportional to the length of the input string. However since the "."
167// can only match in one way and always has a constant length (in this case
168// of 1) it suffices to store the current position on the top of the stack
169// once. Matching now becomes merely incrementing the current position and
170// backtracking becomes decrementing the current position and checking the
171// result against the stored current position. This is faster and saves
172// space.
173// * The current state is virtualized.
174// This is used to defer expensive operations until it is clear that they
175// are needed and to generate code for a node more than once, allowing
176// specialized an efficient versions of the code to be created. This is
177// explained in the section below.
178//
179// Execution state virtualization.
180//
181// Instead of emitting code, nodes that manipulate the state can record their
182// manipulation in an object called the Trace. The Trace object can record a
183// current position offset, an optional backtrack code location on the top of
184// the virtualized backtrack stack and some register changes. When a node is
185// to be emitted it can flush the Trace or update it. Flushing the Trace
186// will emit code to bring the actual state into line with the virtual state.
187// Avoiding flushing the state can postpone some work (e.g. updates of capture
188// registers). Postponing work can save time when executing the regular
189// expression since it may be found that the work never has to be done as a
190// failure to match can occur. In addition it is much faster to jump to a
191// known backtrack code location than it is to pop an unknown backtrack
192// location from the stack and jump there.
193//
194// The virtual state found in the Trace affects code generation. For example
195// the virtual state contains the difference between the actual current
196// position and the virtual current position, and matching code needs to use
197// this offset to attempt a match in the correct location of the input
198// string. Therefore code generated for a non-trivial trace is specialized
199// to that trace. The code generator therefore has the ability to generate
200// code for each node several times. In order to limit the size of the
201// generated code there is an arbitrary limit on how many specialized sets of
202// code may be generated for a given node. If the limit is reached, the
203// trace is flushed and a generic version of the code for a node is emitted.
204// This is subsequently used for that node. The code emitted for non-generic
205// trace is not recorded in the node and so it cannot currently be reused in
206// the event that code generation is requested for an identical trace.
207
209 UNREACHABLE();
210}
211
213 text->AddElement(TextElement::Atom(this));
214}
215
217 text->AddElement(TextElement::CharClass(this));
218}
219
221 for (intptr_t i = 0; i < elements()->length(); i++)
222 text->AddElement((*elements())[i]);
223}
224
226 return TextElement(ATOM, atom);
227}
228
231}
232
233intptr_t TextElement::length() const {
234 switch (text_type()) {
235 case ATOM:
236 return atom()->length();
237
238 case CHAR_CLASS:
239 return 1;
240 }
241 UNREACHABLE();
242 return 0;
243}
244
246 public:
247 FrequencyCollator() : total_samples_(0) {
248 for (intptr_t i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
249 frequencies_[i] = CharacterFrequency(i);
250 }
251 }
252
253 void CountCharacter(intptr_t character) {
254 intptr_t index = (character & RegExpMacroAssembler::kTableMask);
255 frequencies_[index].Increment();
256 total_samples_++;
257 }
258
259 // Does not measure in percent, but rather per-128 (the table size from the
260 // regexp macro assembler).
261 intptr_t Frequency(intptr_t in_character) {
262 ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character);
263 if (total_samples_ < 1) return 1; // Division by zero.
264 intptr_t freq_in_per128 =
265 (frequencies_[in_character].counter() * 128) / total_samples_;
266 return freq_in_per128;
267 }
268
269 private:
270 class CharacterFrequency {
271 public:
272 CharacterFrequency() : counter_(0), character_(-1) {}
273 explicit CharacterFrequency(intptr_t character)
274 : counter_(0), character_(character) {}
275
276 void Increment() { counter_++; }
277 intptr_t counter() { return counter_; }
278 intptr_t character() { return character_; }
279
280 private:
281 intptr_t counter_;
282 intptr_t character_;
283
285 };
286
287 private:
288 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
289 intptr_t total_samples_;
290};
291
293 public:
294 RegExpCompiler(intptr_t capture_count, bool is_one_byte);
295
296 intptr_t AllocateRegister() { return next_register_++; }
297
298 // Lookarounds to match lone surrogates for unicode character class matches
299 // are never nested. We can therefore reuse registers.
301 if (unicode_lookaround_stack_register_ == kNoRegister) {
302 unicode_lookaround_stack_register_ = AllocateRegister();
303 }
304 return unicode_lookaround_stack_register_;
305 }
306
308 if (unicode_lookaround_position_register_ == kNoRegister) {
309 unicode_lookaround_position_register_ = AllocateRegister();
310 }
311 return unicode_lookaround_position_register_;
312 }
313
314#if !defined(DART_PRECOMPILED_RUNTIME)
316 RegExpNode* start,
317 intptr_t capture_count,
318 const String& pattern);
319#endif
320
323 RegExpNode* start,
324 intptr_t capture_count,
325 const String& pattern);
326
327 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
328
329 static constexpr intptr_t kImplementationOffset = 0;
330 static constexpr intptr_t kNumberOfRegistersOffset = 0;
331 static constexpr intptr_t kCodeOffset = 1;
332
333 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
334 EndNode* accept() { return accept_; }
335
336 static constexpr intptr_t kMaxRecursion = 100;
337 inline intptr_t recursion_depth() { return recursion_depth_; }
338 inline void IncrementRecursionDepth() { recursion_depth_++; }
339 inline void DecrementRecursionDepth() { recursion_depth_--; }
340
341 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
342
343 inline bool one_byte() const { return is_one_byte_; }
344 bool read_backward() { return read_backward_; }
345 void set_read_backward(bool value) { read_backward_ = value; }
346 FrequencyCollator* frequency_collator() { return &frequency_collator_; }
347
348 intptr_t current_expansion_factor() { return current_expansion_factor_; }
350 current_expansion_factor_ = value;
351 }
352
353 Zone* zone() const { return zone_; }
354
355 static constexpr intptr_t kNoRegister = -1;
356
357 private:
358 EndNode* accept_;
359 intptr_t next_register_;
360 intptr_t unicode_lookaround_stack_register_;
361 intptr_t unicode_lookaround_position_register_;
363 intptr_t recursion_depth_;
364 RegExpMacroAssembler* macro_assembler_;
365 bool is_one_byte_;
366 bool reg_exp_too_big_;
367 bool read_backward_;
368 intptr_t current_expansion_factor_;
369 FrequencyCollator frequency_collator_;
370 Zone* zone_;
371};
372
374 public:
376 compiler->IncrementRecursionDepth();
377 }
379
380 private:
381 RegExpCompiler* compiler_;
382};
383
385 return RegExpEngine::CompilationResult("RegExp too big");
386}
387
388// Attempts to compile the regexp using an Irregexp code generator. Returns
389// a fixed array or a null handle depending on whether it succeeded.
390RegExpCompiler::RegExpCompiler(intptr_t capture_count, bool is_one_byte)
391 : next_register_(2 * (capture_count + 1)),
392 unicode_lookaround_stack_register_(kNoRegister),
393 unicode_lookaround_position_register_(kNoRegister),
394 work_list_(nullptr),
395 recursion_depth_(0),
396 is_one_byte_(is_one_byte),
397 reg_exp_too_big_(false),
398 read_backward_(false),
399 current_expansion_factor_(1),
400 zone_(Thread::Current()->zone()) {
401 accept_ = new (Z) EndNode(EndNode::ACCEPT, Z);
402}
403
404#if !defined(DART_PRECOMPILED_RUNTIME)
406 IRRegExpMacroAssembler* macro_assembler,
407 RegExpNode* start,
408 intptr_t capture_count,
409 const String& pattern) {
410 macro_assembler->set_slow_safe(false /* use_slow_safe_regexp_compiler */);
411 macro_assembler_ = macro_assembler;
412
414 work_list_ = &work_list;
416 macro_assembler_->PushBacktrack(&fail);
417 Trace new_trace;
418 start->Emit(this, &new_trace);
419 macro_assembler_->BindBlock(&fail);
420 macro_assembler_->Fail();
421 while (!work_list.is_empty()) {
422 work_list.RemoveLast()->Emit(this, &new_trace);
423 }
424 if (reg_exp_too_big_) return IrregexpRegExpTooBig();
425
426 macro_assembler->GenerateBacktrackBlock();
427 macro_assembler->FinalizeRegistersArray();
428
430 macro_assembler->backtrack_goto(), macro_assembler->graph_entry(),
431 macro_assembler->num_blocks(), macro_assembler->num_stack_locals(),
432 next_register_);
433}
434#endif
435
437 BytecodeRegExpMacroAssembler* macro_assembler,
438 RegExpNode* start,
439 intptr_t capture_count,
440 const String& pattern) {
441 macro_assembler->set_slow_safe(false /* use_slow_safe_regexp_compiler */);
442 macro_assembler_ = macro_assembler;
443
445 work_list_ = &work_list;
447 macro_assembler_->PushBacktrack(&fail);
448 Trace new_trace;
449 start->Emit(this, &new_trace);
450 macro_assembler_->BindBlock(&fail);
451 macro_assembler_->Fail();
452 while (!work_list.is_empty()) {
453 work_list.RemoveLast()->Emit(this, &new_trace);
454 }
455 if (reg_exp_too_big_) return IrregexpRegExpTooBig();
456
457 TypedData& bytecode = TypedData::ZoneHandle(macro_assembler->GetBytecode());
458 return RegExpEngine::CompilationResult(&bytecode, next_register_);
459}
460
463 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
464 return range.Contains(that);
465 } else {
466 return reg() == that;
467 }
468}
469
470bool Trace::mentions_reg(intptr_t reg) {
471 for (DeferredAction* action = actions_; action != nullptr;
472 action = action->next()) {
473 if (action->Mentions(reg)) return true;
474 }
475 return false;
476}
477
478bool Trace::GetStoredPosition(intptr_t reg, intptr_t* cp_offset) {
479 ASSERT(*cp_offset == 0);
480 for (DeferredAction* action = actions_; action != nullptr;
481 action = action->next()) {
482 if (action->Mentions(reg)) {
483 if (action->action_type() == ActionNode::STORE_POSITION) {
484 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
485 return true;
486 } else {
487 return false;
488 }
489 }
490 }
491 return false;
492}
493
494// This is called as we come into a loop choice node and some other tricky
495// nodes. It normalizes the state of the code generator to ensure we can
496// generate generic code.
497intptr_t Trace::FindAffectedRegisters(OutSet* affected_registers, Zone* zone) {
498 intptr_t max_register = RegExpCompiler::kNoRegister;
499 for (DeferredAction* action = actions_; action != nullptr;
500 action = action->next()) {
501 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
502 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
503 for (intptr_t i = range.from(); i <= range.to(); i++)
504 affected_registers->Set(i, zone);
505 if (range.to() > max_register) max_register = range.to();
506 } else {
507 affected_registers->Set(action->reg(), zone);
508 if (action->reg() > max_register) max_register = action->reg();
509 }
510 }
511 return max_register;
512}
513
514void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
515 intptr_t max_register,
516 const OutSet& registers_to_pop,
517 const OutSet& registers_to_clear) {
518 for (intptr_t reg = max_register; reg >= 0; reg--) {
519 if (registers_to_pop.Get(reg)) {
520 assembler->PopRegister(reg);
521 } else if (registers_to_clear.Get(reg)) {
522 intptr_t clear_to = reg;
523 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
524 reg--;
525 }
526 assembler->ClearRegisters(reg, clear_to);
527 }
528 }
529}
530
531void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
532 intptr_t max_register,
533 const OutSet& affected_registers,
534 OutSet* registers_to_pop,
535 OutSet* registers_to_clear,
536 Zone* zone) {
537 for (intptr_t reg = 0; reg <= max_register; reg++) {
538 if (!affected_registers.Get(reg)) {
539 continue;
540 }
541
542 // The chronologically first deferred action in the trace
543 // is used to infer the action needed to restore a register
544 // to its previous state (or not, if it's safe to ignore it).
545 enum DeferredActionUndoType { ACTION_IGNORE, ACTION_RESTORE, ACTION_CLEAR };
546 DeferredActionUndoType undo_action = ACTION_IGNORE;
547
548 intptr_t value = 0;
549 bool absolute = false;
550 bool clear = false;
551 const intptr_t kNoStore = kMinInt32;
552 intptr_t store_position = kNoStore;
553 // This is a little tricky because we are scanning the actions in reverse
554 // historical order (newest first).
555 for (DeferredAction* action = actions_; action != nullptr;
556 action = action->next()) {
557 if (action->Mentions(reg)) {
558 switch (action->action_type()) {
560 Trace::DeferredSetRegister* psr =
561 static_cast<Trace::DeferredSetRegister*>(action);
562 if (!absolute) {
563 value += psr->value();
564 absolute = true;
565 }
566 // SET_REGISTER is currently only used for newly introduced loop
567 // counters. They can have a significant previous value if they
568 // occur in a loop. TODO(lrn): Propagate this information, so we
569 // can set undo_action to ACTION_IGNORE if we know there is no
570 // value to restore.
571 undo_action = ACTION_RESTORE;
572 ASSERT(store_position == kNoStore);
573 ASSERT(!clear);
574 break;
575 }
577 if (!absolute) {
578 value++;
579 }
580 ASSERT(store_position == kNoStore);
581 ASSERT(!clear);
582 undo_action = ACTION_RESTORE;
583 break;
585 Trace::DeferredCapture* pc =
586 static_cast<Trace::DeferredCapture*>(action);
587 if (!clear && store_position == kNoStore) {
588 store_position = pc->cp_offset();
589 }
590
591 // For captures we know that stores and clears alternate.
592 // Other register, are never cleared, and if the occur
593 // inside a loop, they might be assigned more than once.
594 if (reg <= 1) {
595 // Registers zero and one, aka "capture zero", is
596 // always set correctly if we succeed. There is no
597 // need to undo a setting on backtrack, because we
598 // will set it again or fail.
599 undo_action = ACTION_IGNORE;
600 } else {
601 undo_action = pc->is_capture() ? ACTION_CLEAR : ACTION_RESTORE;
602 }
603 ASSERT(!absolute);
604 ASSERT(value == 0);
605 break;
606 }
608 // Since we're scanning in reverse order, if we've already
609 // set the position we have to ignore historically earlier
610 // clearing operations.
611 if (store_position == kNoStore) {
612 clear = true;
613 }
614 undo_action = ACTION_RESTORE;
615 ASSERT(!absolute);
616 ASSERT(value == 0);
617 break;
618 }
619 default:
620 UNREACHABLE();
621 break;
622 }
623 }
624 }
625 // Prepare for the undo-action (e.g., push if it's going to be popped).
626 if (undo_action == ACTION_RESTORE) {
627 assembler->PushRegister(reg);
628 registers_to_pop->Set(reg, zone);
629 } else if (undo_action == ACTION_CLEAR) {
630 registers_to_clear->Set(reg, zone);
631 }
632 // Perform the chronologically last action (or accumulated increment)
633 // for the register.
634 if (store_position != kNoStore) {
635 assembler->WriteCurrentPositionToRegister(reg, store_position);
636 } else if (clear) {
637 assembler->ClearRegisters(reg, reg);
638 } else if (absolute) {
639 assembler->SetRegister(reg, value);
640 } else if (value != 0) {
641 assembler->AdvanceRegister(reg, value);
642 }
643 }
644}
645
646// This is called as we come into a loop choice node and some other tricky
647// nodes. It normalizes the state of the code generator to ensure we can
648// generate generic code.
650 RegExpMacroAssembler* assembler = compiler->macro_assembler();
651
652 ASSERT(!is_trivial());
653
654 if (actions_ == nullptr && backtrack() == nullptr) {
655 // Here we just have some deferred cp advances to fix and we are back to
656 // a normal situation. We may also have to forget some information gained
657 // through a quick check that was already performed.
658 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
659 // Create a new trivial state and generate the node with that.
660 Trace new_state;
661 successor->Emit(compiler, &new_state);
662 return;
663 }
664
665 // Generate deferred actions here along with code to undo them again.
666 OutSet affected_registers;
667
668 if (backtrack() != nullptr) {
669 // Here we have a concrete backtrack location. These are set up by choice
670 // nodes and so they indicate that we have a deferred save of the current
671 // position which we may need to emit here.
672 assembler->PushCurrentPosition();
673 }
674 Zone* zone = successor->zone();
675 intptr_t max_register = FindAffectedRegisters(&affected_registers, zone);
676 OutSet registers_to_pop;
677 OutSet registers_to_clear;
678 PerformDeferredActions(assembler, max_register, affected_registers,
679 &registers_to_pop, &registers_to_clear, zone);
680 if (cp_offset_ != 0) {
681 assembler->AdvanceCurrentPosition(cp_offset_);
682 }
683
684 // Create a new trivial state and generate the node with that.
685 BlockLabel undo;
686 assembler->PushBacktrack(&undo);
687 Trace new_state;
688 successor->Emit(compiler, &new_state);
689
690 // On backtrack we need to restore state.
691 assembler->BindBlock(&undo);
692 RestoreAffectedRegisters(assembler, max_register, registers_to_pop,
693 registers_to_clear);
694 if (backtrack() == nullptr) {
695 assembler->Backtrack();
696 } else {
697 assembler->PopCurrentPosition();
698 assembler->GoTo(backtrack());
699 }
700}
701
703 RegExpMacroAssembler* assembler = compiler->macro_assembler();
704
705 // Omit flushing the trace. We discard the entire stack frame anyway.
706
707 if (!label()->is_bound()) {
708 // We are completely independent of the trace, since we ignore it,
709 // so this code can be used as the generic version.
710 assembler->BindBlock(label());
711 }
712
713 // Throw away everything on the backtrack stack since the start
714 // of the negative submatch and restore the character position.
715 assembler->ReadCurrentPositionFromRegister(current_position_register_);
716 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
717 if (clear_capture_count_ > 0) {
718 // Clear any captures that might have been performed during the success
719 // of the body of the negative look-ahead.
720 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
721 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
722 }
723 // Now that we have unwound the stack we find at the top of the stack the
724 // backtrack that the BeginSubmatch node got.
725 assembler->Backtrack();
726}
727
729 if (!trace->is_trivial()) {
730 trace->Flush(compiler, this);
731 return;
732 }
733 RegExpMacroAssembler* assembler = compiler->macro_assembler();
734 if (!label()->is_bound()) {
735 assembler->BindBlock(label());
736 }
737 switch (action_) {
738 case ACCEPT:
739 assembler->Succeed();
740 return;
741 case BACKTRACK:
742 assembler->GoTo(trace->backtrack());
743 return;
744 case NEGATIVE_SUBMATCH_SUCCESS:
745 // This case is handled in a different virtual method.
746 UNREACHABLE();
747 }
749}
750
752 if (guards_ == nullptr) guards_ = new (zone) ZoneGrowableArray<Guard*>(1);
753 guards_->Add(guard);
754}
755
757 intptr_t val,
758 RegExpNode* on_success) {
760 new (on_success->zone()) ActionNode(SET_REGISTER, on_success);
761 result->data_.u_store_register.reg = reg;
762 result->data_.u_store_register.value = val;
763 return result;
764}
765
767 RegExpNode* on_success) {
769 new (on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
770 result->data_.u_increment_register.reg = reg;
771 return result;
772}
773
775 bool is_capture,
776 RegExpNode* on_success) {
778 new (on_success->zone()) ActionNode(STORE_POSITION, on_success);
779 result->data_.u_position_register.reg = reg;
780 result->data_.u_position_register.is_capture = is_capture;
781 return result;
782}
783
786 new (on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
787 result->data_.u_clear_captures.range_from = range.from();
788 result->data_.u_clear_captures.range_to = range.to();
789 return result;
790}
791
793 intptr_t position_reg,
794 RegExpNode* on_success) {
796 new (on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
797 result->data_.u_submatch.stack_pointer_register = stack_reg;
798 result->data_.u_submatch.current_position_register = position_reg;
799 return result;
800}
801
803 intptr_t position_reg,
804 intptr_t clear_register_count,
805 intptr_t clear_register_from,
806 RegExpNode* on_success) {
807 ActionNode* result = new (on_success->zone())
808 ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
809 result->data_.u_submatch.stack_pointer_register = stack_reg;
810 result->data_.u_submatch.current_position_register = position_reg;
811 result->data_.u_submatch.clear_register_count = clear_register_count;
812 result->data_.u_submatch.clear_register_from = clear_register_from;
813 return result;
814}
815
817 intptr_t repetition_register,
818 intptr_t repetition_limit,
819 RegExpNode* on_success) {
821 new (on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
822 result->data_.u_empty_match_check.start_register = start_register;
823 result->data_.u_empty_match_check.repetition_register = repetition_register;
824 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
825 return result;
826}
827
828#define DEFINE_ACCEPT(Type) \
829 void Type##Node::Accept(NodeVisitor* visitor) { \
830 visitor->Visit##Type(this); \
831 }
833#undef DEFINE_ACCEPT
834
836 visitor->VisitLoopChoice(this);
837}
838
839// -------------------------------------------------------------------
840// Emit code.
841
842void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
843 Guard* guard,
844 Trace* trace) {
845 switch (guard->op()) {
846 case Guard::LT:
847 ASSERT(!trace->mentions_reg(guard->reg()));
848 macro_assembler->IfRegisterGE(guard->reg(), guard->value(),
849 trace->backtrack());
850 break;
851 case Guard::GEQ:
852 ASSERT(!trace->mentions_reg(guard->reg()));
853 macro_assembler->IfRegisterLT(guard->reg(), guard->value(),
854 trace->backtrack());
855 break;
856 }
857}
858
859// Returns the number of characters in the equivalence class, omitting those
860// that cannot occur in the source string because it is ASCII.
861static intptr_t GetCaseIndependentLetters(uint16_t character,
862 bool one_byte_subject,
863 int32_t* letters) {
865 intptr_t length = jsregexp_uncanonicalize.get(character, '\0', letters);
866 // Unibrow returns 0 or 1 for characters where case independence is
867 // trivial.
868 if (length == 0) {
869 letters[0] = character;
870 length = 1;
871 }
872 if (!one_byte_subject || character <= Symbols::kMaxOneCharCodeSymbol) {
873 return length;
874 }
875
876 // The standard requires that non-ASCII characters cannot have ASCII
877 // character codes in their equivalence class.
878 // TODO(dcarney): issue 3550 this is not actually true for Latin1 anymore,
879 // is it? For example, \u00C5 is equivalent to \u212B.
880 return 0;
881}
882
883static inline bool EmitSimpleCharacter(Zone* zone,
885 uint16_t c,
886 BlockLabel* on_failure,
887 intptr_t cp_offset,
888 bool check,
889 bool preloaded) {
890 RegExpMacroAssembler* assembler = compiler->macro_assembler();
891 bool bound_checked = false;
892 if (!preloaded) {
893 assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
894 bound_checked = true;
895 }
896 assembler->CheckNotCharacter(c, on_failure);
897 return bound_checked;
898}
899
900// Only emits non-letters (things that don't have case). Only used for case
901// independent matches.
902static inline bool EmitAtomNonLetter(Zone* zone,
904 uint16_t c,
905 BlockLabel* on_failure,
906 intptr_t cp_offset,
907 bool check,
908 bool preloaded) {
909 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
910 bool one_byte = compiler->one_byte();
912 intptr_t length = GetCaseIndependentLetters(c, one_byte, chars);
913 if (length < 1) {
914 // This can't match. Must be an one-byte subject and a non-one-byte
915 // character. We do not need to do anything since the one-byte pass
916 // already handled this.
917 return false; // Bounds not checked.
918 }
919 bool checked = false;
920 // We handle the length > 1 case in a later pass.
921 if (length == 1) {
922 if (one_byte && c > Symbols::kMaxOneCharCodeSymbol) {
923 // Can't match - see above.
924 return false; // Bounds not checked.
925 }
926 if (!preloaded) {
927 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
928 checked = check;
929 }
930 macro_assembler->CheckNotCharacter(c, on_failure);
931 }
932 return checked;
933}
934
936 bool one_byte,
937 uint16_t c1,
938 uint16_t c2,
939 BlockLabel* on_failure) {
940 uint16_t char_mask;
941 if (one_byte) {
943 } else {
944 char_mask = Utf16::kMaxCodeUnit;
945 }
946 uint16_t exor = c1 ^ c2;
947 // Check whether exor has only one bit set.
948 if (((exor - 1) & exor) == 0) {
949 // If c1 and c2 differ only by one bit.
950 // Ecma262UnCanonicalize always gives the highest number last.
951 ASSERT(c2 > c1);
952 uint16_t mask = char_mask ^ exor;
953 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
954 return true;
955 }
956 ASSERT(c2 > c1);
957 uint16_t diff = c2 - c1;
958 if (((diff - 1) & diff) == 0 && c1 >= diff) {
959 // If the characters differ by 2^n but don't differ by one bit then
960 // subtract the difference from the found character, then do the or
961 // trick. We avoid the theoretical case where negative numbers are
962 // involved in order to simplify code generation.
963 uint16_t mask = char_mask ^ diff;
964 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff, diff, mask,
965 on_failure);
966 return true;
967 }
968 return false;
969}
970
971typedef bool EmitCharacterFunction(Zone* zone,
973 uint16_t c,
974 BlockLabel* on_failure,
975 intptr_t cp_offset,
976 bool check,
977 bool preloaded);
978
979// Only emits letters (things that have case). Only used for case independent
980// matches.
981static inline bool EmitAtomLetter(Zone* zone,
983 uint16_t c,
984 BlockLabel* on_failure,
985 intptr_t cp_offset,
986 bool check,
987 bool preloaded) {
988 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
989 bool one_byte = compiler->one_byte();
991 intptr_t length = GetCaseIndependentLetters(c, one_byte, chars);
992 if (length <= 1) return false;
993 // We may not need to check against the end of the input string
994 // if this character lies before a character that matched.
995 if (!preloaded) {
996 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
997 }
1000 switch (length) {
1001 case 2: {
1002 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1003 chars[1], on_failure)) {
1004 } else {
1005 macro_assembler->CheckCharacter(chars[0], &ok);
1006 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1007 macro_assembler->BindBlock(&ok);
1008 }
1009 break;
1010 }
1011 case 4:
1012 macro_assembler->CheckCharacter(chars[3], &ok);
1014 case 3:
1015 macro_assembler->CheckCharacter(chars[0], &ok);
1016 macro_assembler->CheckCharacter(chars[1], &ok);
1017 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1018 macro_assembler->BindBlock(&ok);
1019 break;
1020 default:
1021 UNREACHABLE();
1022 break;
1023 }
1024 return true;
1025}
1026
1028 uint16_t border,
1029 BlockLabel* fall_through,
1030 BlockLabel* above_or_equal,
1031 BlockLabel* below) {
1032 if (below != fall_through) {
1033 masm->CheckCharacterLT(border, below);
1034 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1035 } else {
1036 masm->CheckCharacterGT(border - 1, above_or_equal);
1037 }
1038}
1039
1041 uint16_t first,
1042 uint16_t last,
1043 BlockLabel* fall_through,
1044 BlockLabel* in_range,
1045 BlockLabel* out_of_range) {
1046 if (in_range == fall_through) {
1047 if (first == last) {
1048 masm->CheckNotCharacter(first, out_of_range);
1049 } else {
1050 masm->CheckCharacterNotInRange(first, last, out_of_range);
1051 }
1052 } else {
1053 if (first == last) {
1054 masm->CheckCharacter(first, in_range);
1055 } else {
1056 masm->CheckCharacterInRange(first, last, in_range);
1057 }
1058 if (out_of_range != fall_through) masm->GoTo(out_of_range);
1059 }
1060}
1061
1062// even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1063// odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1066 intptr_t start_index,
1067 intptr_t end_index,
1068 uint16_t min_char,
1069 BlockLabel* fall_through,
1070 BlockLabel* even_label,
1071 BlockLabel* odd_label) {
1072 const intptr_t kSize = RegExpMacroAssembler::kTableSize;
1073 const intptr_t kMask = RegExpMacroAssembler::kTableMask;
1074
1075 intptr_t base = (min_char & ~kMask);
1076
1077 // Assert that everything is on one kTableSize page.
1078 for (intptr_t i = start_index; i <= end_index; i++) {
1079 ASSERT((ranges->At(i) & ~kMask) == base);
1080 }
1081 ASSERT(start_index == 0 || (ranges->At(start_index - 1) & ~kMask) <= base);
1082
1083 char templ[kSize];
1084 BlockLabel* on_bit_set;
1085 BlockLabel* on_bit_clear;
1086 intptr_t bit;
1087 if (even_label == fall_through) {
1088 on_bit_set = odd_label;
1089 on_bit_clear = even_label;
1090 bit = 1;
1091 } else {
1092 on_bit_set = even_label;
1093 on_bit_clear = odd_label;
1094 bit = 0;
1095 }
1096 for (intptr_t i = 0; i < (ranges->At(start_index) & kMask) && i < kSize;
1097 i++) {
1098 templ[i] = bit;
1099 }
1100 intptr_t j = 0;
1101 bit ^= 1;
1102 for (intptr_t i = start_index; i < end_index; i++) {
1103 for (j = (ranges->At(i) & kMask); j < (ranges->At(i + 1) & kMask); j++) {
1104 templ[j] = bit;
1105 }
1106 bit ^= 1;
1107 }
1108 for (intptr_t i = j; i < kSize; i++) {
1109 templ[i] = bit;
1110 }
1111 // TODO(erikcorry): Cache these.
1112 const TypedData& ba = TypedData::ZoneHandle(
1113 masm->zone(), TypedData::New(kTypedDataUint8ArrayCid, kSize, Heap::kOld));
1114 for (intptr_t i = 0; i < kSize; i++) {
1115 ba.SetUint8(i, templ[i]);
1116 }
1117 masm->CheckBitInTable(ba, on_bit_set);
1118 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1119}
1120
1123 intptr_t start_index,
1124 intptr_t end_index,
1125 intptr_t cut_index,
1126 BlockLabel* even_label,
1127 BlockLabel* odd_label) {
1128 bool odd = (((cut_index - start_index) & 1) == 1);
1129 BlockLabel* in_range_label = odd ? odd_label : even_label;
1130 BlockLabel dummy;
1131 EmitDoubleBoundaryTest(masm, ranges->At(cut_index),
1132 ranges->At(cut_index + 1) - 1, &dummy, in_range_label,
1133 &dummy);
1134 ASSERT(!dummy.is_linked());
1135 // Cut out the single range by rewriting the array. This creates a new
1136 // range that is a merger of the two ranges on either side of the one we
1137 // are cutting out. The oddity of the labels is preserved.
1138 for (intptr_t j = cut_index; j > start_index; j--) {
1139 (*ranges)[j] = ranges->At(j - 1);
1140 }
1141 for (intptr_t j = cut_index + 1; j < end_index; j++) {
1142 (*ranges)[j] = ranges->At(j + 1);
1143 }
1144}
1145
1146// Unicode case. Split the search space into kSize spaces that are handled
1147// with recursion.
1149 intptr_t start_index,
1150 intptr_t end_index,
1151 intptr_t* new_start_index,
1152 intptr_t* new_end_index,
1153 uint16_t* border) {
1154 const intptr_t kSize = RegExpMacroAssembler::kTableSize;
1155 const intptr_t kMask = RegExpMacroAssembler::kTableMask;
1156
1157 uint16_t first = ranges->At(start_index);
1158 uint16_t last = ranges->At(end_index) - 1;
1159
1160 *new_start_index = start_index;
1161 *border = (ranges->At(start_index) & ~kMask) + kSize;
1162 while (*new_start_index < end_index) {
1163 if (ranges->At(*new_start_index) > *border) break;
1164 (*new_start_index)++;
1165 }
1166 // new_start_index is the index of the first edge that is beyond the
1167 // current kSize space.
1168
1169 // For very large search spaces we do a binary chop search of the non-Latin1
1170 // space instead of just going to the end of the current kSize space. The
1171 // heuristics are complicated a little by the fact that any 128-character
1172 // encoding space can be quickly tested with a table lookup, so we don't
1173 // wish to do binary chop search at a smaller granularity than that. A
1174 // 128-character space can take up a lot of space in the ranges array if,
1175 // for example, we only want to match every second character (eg. the lower
1176 // case characters on some Unicode pages).
1177 intptr_t binary_chop_index = (end_index + start_index) / 2;
1178 // The first test ensures that we get to the code that handles the Latin1
1179 // range with a single not-taken branch, speeding up this important
1180 // character range (even non-Latin1 charset-based text has spaces and
1181 // punctuation).
1182 if (*border - 1 > Symbols::kMaxOneCharCodeSymbol && // Latin1 case.
1183 end_index - start_index > (*new_start_index - start_index) * 2 &&
1184 last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1185 ranges->At(binary_chop_index) >= first + 2 * kSize) {
1186 intptr_t scan_forward_for_section_border = binary_chop_index;
1187 intptr_t new_border = (ranges->At(binary_chop_index) | kMask) + 1;
1188
1189 while (scan_forward_for_section_border < end_index) {
1190 if (ranges->At(scan_forward_for_section_border) > new_border) {
1191 *new_start_index = scan_forward_for_section_border;
1192 *border = new_border;
1193 break;
1194 }
1195 scan_forward_for_section_border++;
1196 }
1197 }
1198
1199 ASSERT(*new_start_index > start_index);
1200 *new_end_index = *new_start_index - 1;
1201 if (ranges->At(*new_end_index) == *border) {
1202 (*new_end_index)--;
1203 }
1204 if (*border >= ranges->At(end_index)) {
1205 *border = ranges->At(end_index);
1206 *new_start_index = end_index; // Won't be used.
1207 *new_end_index = end_index - 1;
1208 }
1209}
1210
1211// Gets a series of segment boundaries representing a character class. If the
1212// character is in the range between an even and an odd boundary (counting from
1213// start_index) then go to even_label, otherwise go to odd_label. We already
1214// know that the character is in the range of min_char to max_char inclusive.
1215// Either label can be null indicating backtracking. Either label can also be
1216// equal to the fall_through label.
1219 intptr_t start_index,
1220 intptr_t end_index,
1221 uint16_t min_char,
1222 uint16_t max_char,
1223 BlockLabel* fall_through,
1224 BlockLabel* even_label,
1225 BlockLabel* odd_label) {
1226 uint16_t first = ranges->At(start_index);
1227 uint16_t last = ranges->At(end_index) - 1;
1228
1229 ASSERT(min_char < first);
1230
1231 // Just need to test if the character is before or on-or-after
1232 // a particular character.
1233 if (start_index == end_index) {
1234 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1235 return;
1236 }
1237
1238 // Another almost trivial case: There is one interval in the middle that is
1239 // different from the end intervals.
1240 if (start_index + 1 == end_index) {
1241 EmitDoubleBoundaryTest(masm, first, last, fall_through, even_label,
1242 odd_label);
1243 return;
1244 }
1245
1246 // It's not worth using table lookup if there are very few intervals in the
1247 // character class.
1248 if (end_index - start_index <= 6) {
1249 // It is faster to test for individual characters, so we look for those
1250 // first, then try arbitrary ranges in the second round.
1251 static intptr_t kNoCutIndex = -1;
1252 intptr_t cut = kNoCutIndex;
1253 for (intptr_t i = start_index; i < end_index; i++) {
1254 if (ranges->At(i) == ranges->At(i + 1) - 1) {
1255 cut = i;
1256 break;
1257 }
1258 }
1259 if (cut == kNoCutIndex) cut = start_index;
1260 CutOutRange(masm, ranges, start_index, end_index, cut, even_label,
1261 odd_label);
1262 ASSERT(end_index - start_index >= 2);
1263 GenerateBranches(masm, ranges, start_index + 1, end_index - 1, min_char,
1264 max_char, fall_through, even_label, odd_label);
1265 return;
1266 }
1267
1268 // If there are a lot of intervals in the regexp, then we will use tables to
1269 // determine whether the character is inside or outside the character class.
1271
1272 if ((max_char >> kBits) == (min_char >> kBits)) {
1273 EmitUseLookupTable(masm, ranges, start_index, end_index, min_char,
1274 fall_through, even_label, odd_label);
1275 return;
1276 }
1277
1278 if ((min_char >> kBits) != (first >> kBits)) {
1279 masm->CheckCharacterLT(first, odd_label);
1280 GenerateBranches(masm, ranges, start_index + 1, end_index, first, max_char,
1281 fall_through, odd_label, even_label);
1282 return;
1283 }
1284
1285 intptr_t new_start_index = 0;
1286 intptr_t new_end_index = 0;
1287 uint16_t border = 0;
1288
1289 SplitSearchSpace(ranges, start_index, end_index, &new_start_index,
1290 &new_end_index, &border);
1291
1292 BlockLabel handle_rest;
1293 BlockLabel* above = &handle_rest;
1294 if (border == last + 1) {
1295 // We didn't find any section that started after the limit, so everything
1296 // above the border is one of the terminal labels.
1297 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
1298 ASSERT(new_end_index == end_index - 1);
1299 }
1300
1301 ASSERT(start_index <= new_end_index);
1302 ASSERT(new_start_index <= end_index);
1303 ASSERT(start_index < new_start_index);
1304 ASSERT(new_end_index < end_index);
1305 ASSERT(new_end_index + 1 == new_start_index ||
1306 (new_end_index + 2 == new_start_index &&
1307 border == ranges->At(new_end_index + 1)));
1308 ASSERT(min_char < border - 1);
1309 ASSERT(border < max_char);
1310 ASSERT(ranges->At(new_end_index) < border);
1311 ASSERT(border < ranges->At(new_start_index) ||
1312 (border == ranges->At(new_start_index) &&
1313 new_start_index == end_index && new_end_index == end_index - 1 &&
1314 border == last + 1));
1315 ASSERT(new_start_index == 0 || border >= ranges->At(new_start_index - 1));
1316
1317 masm->CheckCharacterGT(border - 1, above);
1318 BlockLabel dummy;
1319 GenerateBranches(masm, ranges, start_index, new_end_index, min_char,
1320 border - 1, &dummy, even_label, odd_label);
1321
1322 if (handle_rest.is_linked()) {
1323 masm->BindBlock(&handle_rest);
1324 bool flip = (new_start_index & 1) != (start_index & 1);
1325 GenerateBranches(masm, ranges, new_start_index, end_index, border, max_char,
1326 &dummy, flip ? odd_label : even_label,
1327 flip ? even_label : odd_label);
1328 }
1329}
1330
1331static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1333 bool one_byte,
1334 BlockLabel* on_failure,
1335 intptr_t cp_offset,
1336 bool check_offset,
1337 bool preloaded,
1338 Zone* zone) {
1339 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();
1340 if (!CharacterRange::IsCanonical(ranges)) {
1342 }
1343
1344 uint16_t max_char;
1345 if (one_byte) {
1347 } else {
1348 max_char = Utf16::kMaxCodeUnit;
1349 }
1350
1351 intptr_t range_count = ranges->length();
1352
1353 intptr_t last_valid_range = range_count - 1;
1354 while (last_valid_range >= 0) {
1355 const CharacterRange& range = ranges->At(last_valid_range);
1356 if (range.from() <= max_char) {
1357 break;
1358 }
1359 last_valid_range--;
1360 }
1361
1362 if (last_valid_range < 0) {
1363 if (!cc->is_negated()) {
1364 macro_assembler->GoTo(on_failure);
1365 }
1366 if (check_offset) {
1367 macro_assembler->CheckPosition(cp_offset, on_failure);
1368 }
1369 return;
1370 }
1371
1372 if (last_valid_range == 0 && ranges->At(0).IsEverything(max_char)) {
1373 if (cc->is_negated()) {
1374 macro_assembler->GoTo(on_failure);
1375 } else {
1376 // This is a common case hit by non-anchored expressions.
1377 if (check_offset) {
1378 macro_assembler->CheckPosition(cp_offset, on_failure);
1379 }
1380 }
1381 return;
1382 }
1383
1384 if (!preloaded) {
1385 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
1386 }
1387
1388 if (cc->is_standard() && macro_assembler->CheckSpecialCharacterClass(
1389 cc->standard_type(), on_failure)) {
1390 return;
1391 }
1392
1393 // A new list with ascending entries. Each entry is a code unit
1394 // where there is a boundary between code units that are part of
1395 // the class and code units that are not. Normally we insert an
1396 // entry at zero which goes to the failure label, but if there
1397 // was already one there we fall through for success on that entry.
1398 // Subsequent entries have alternating meaning (success/failure).
1399 ZoneGrowableArray<uint16_t>* range_boundaries =
1400 new (zone) ZoneGrowableArray<uint16_t>(last_valid_range);
1401
1402 bool zeroth_entry_is_failure = !cc->is_negated();
1403
1404 for (intptr_t i = 0; i <= last_valid_range; i++) {
1405 const CharacterRange& range = ranges->At(i);
1406 if (range.from() == 0) {
1407 ASSERT(i == 0);
1408 zeroth_entry_is_failure = !zeroth_entry_is_failure;
1409 } else {
1410 range_boundaries->Add(range.from());
1411 }
1412 if (range.to() + 1 <= max_char) {
1413 range_boundaries->Add(range.to() + 1);
1414 }
1415 }
1416 intptr_t end_index = range_boundaries->length() - 1;
1417
1418 BlockLabel fall_through;
1419 GenerateBranches(macro_assembler, range_boundaries,
1420 0, // start_index.
1421 end_index,
1422 0, // min_char.
1423 max_char, &fall_through,
1424 zeroth_entry_is_failure ? &fall_through : on_failure,
1425 zeroth_entry_is_failure ? on_failure : &fall_through);
1426 macro_assembler->BindBlock(&fall_through);
1427}
1428
1430
1432 Trace* trace) {
1433 // If we are generating a greedy loop then don't stop and don't reuse code.
1434 if (trace->stop_node() != nullptr) {
1435 return CONTINUE;
1436 }
1437
1438 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1439 if (trace->is_trivial()) {
1440 if (label_.is_bound()) {
1441 // We are being asked to generate a generic version, but that's already
1442 // been done so just go to it.
1443 macro_assembler->GoTo(&label_);
1444 return DONE;
1445 }
1446 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
1447 // To avoid too deep recursion we push the node to the work queue and just
1448 // generate a goto here.
1449 compiler->AddWork(this);
1450 macro_assembler->GoTo(&label_);
1451 return DONE;
1452 }
1453 // Generate generic version of the node and bind the label for later use.
1454 macro_assembler->BindBlock(&label_);
1455 return CONTINUE;
1456 }
1457
1458 // We are being asked to make a non-generic version. Keep track of how many
1459 // non-generic versions we generate so as not to overdo it.
1460 trace_count_++;
1461 if (kRegexpOptimization && trace_count_ < kMaxCopiesCodeGenerated &&
1462 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
1463 return CONTINUE;
1464 }
1465
1466 // If we get here code has been generated for this node too many times or
1467 // recursion is too deep. Time to switch to a generic version. The code for
1468 // generic versions above can handle deep recursion properly.
1469 trace->Flush(compiler, this);
1470 return DONE;
1471}
1472
1473intptr_t ActionNode::EatsAtLeast(intptr_t still_to_find,
1474 intptr_t budget,
1475 bool not_at_start) {
1476 if (budget <= 0) return 0;
1477 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
1478 return on_success()->EatsAtLeast(still_to_find, budget - 1, not_at_start);
1479}
1480
1482 intptr_t budget,
1484 bool not_at_start) {
1485 if (action_type_ == BEGIN_SUBMATCH) {
1486 bm->SetRest(offset);
1487 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
1488 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
1489 }
1490 SaveBMInfo(bm, not_at_start, offset);
1491}
1492
1493intptr_t AssertionNode::EatsAtLeast(intptr_t still_to_find,
1494 intptr_t budget,
1495 bool not_at_start) {
1496 if (budget <= 0) return 0;
1497 // If we know we are not at the start and we are asked "how many characters
1498 // will you match if you succeed?" then we can answer anything since false
1499 // implies false. So lets just return the max answer (still_to_find) since
1500 // that won't prevent us from preloading a lot of characters for the other
1501 // branches in the node graph.
1502 if (assertion_type() == AT_START && not_at_start) return still_to_find;
1503 return on_success()->EatsAtLeast(still_to_find, budget - 1, not_at_start);
1504}
1505
1507 intptr_t budget,
1509 bool not_at_start) {
1510 // Match the behaviour of EatsAtLeast on this node.
1511 if (assertion_type() == AT_START && not_at_start) return;
1512 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
1513 SaveBMInfo(bm, not_at_start, offset);
1514}
1515
1516intptr_t BackReferenceNode::EatsAtLeast(intptr_t still_to_find,
1517 intptr_t budget,
1518 bool not_at_start) {
1519 if (read_backward()) return 0;
1520 if (budget <= 0) return 0;
1521 return on_success()->EatsAtLeast(still_to_find, budget - 1, not_at_start);
1522}
1523
1524intptr_t TextNode::EatsAtLeast(intptr_t still_to_find,
1525 intptr_t budget,
1526 bool not_at_start) {
1527 if (read_backward()) return 0;
1528 intptr_t answer = Length();
1529 if (answer >= still_to_find) return answer;
1530 if (budget <= 0) return answer;
1531 // We are not at start after this node so we set the last argument to 'true'.
1532 return answer +
1533 on_success()->EatsAtLeast(still_to_find - answer, budget - 1, true);
1534}
1535
1536intptr_t NegativeLookaroundChoiceNode::EatsAtLeast(intptr_t still_to_find,
1537 intptr_t budget,
1538 bool not_at_start) {
1539 if (budget <= 0) return 0;
1540 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1541 // afterwards.
1542 RegExpNode* node = (*alternatives_)[1].node();
1543 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
1544}
1545
1547 QuickCheckDetails* details,
1549 intptr_t filled_in,
1550 bool not_at_start) {
1551 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1552 // afterwards.
1553 RegExpNode* node = (*alternatives_)[1].node();
1554 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
1555}
1556
1557intptr_t ChoiceNode::EatsAtLeastHelper(intptr_t still_to_find,
1558 intptr_t budget,
1559 RegExpNode* ignore_this_node,
1560 bool not_at_start) {
1561 if (budget <= 0) return 0;
1562 intptr_t min = 100;
1563 intptr_t choice_count = alternatives_->length();
1564 budget = (budget - 1) / choice_count;
1565 for (intptr_t i = 0; i < choice_count; i++) {
1566 RegExpNode* node = (*alternatives_)[i].node();
1567 if (node == ignore_this_node) continue;
1568 intptr_t node_eats_at_least =
1569 node->EatsAtLeast(still_to_find, budget, not_at_start);
1570 if (node_eats_at_least < min) min = node_eats_at_least;
1571 if (min == 0) return 0;
1572 }
1573 return min;
1574}
1575
1576intptr_t LoopChoiceNode::EatsAtLeast(intptr_t still_to_find,
1577 intptr_t budget,
1578 bool not_at_start) {
1579 return EatsAtLeastHelper(still_to_find, budget - 1, loop_node_, not_at_start);
1580}
1581
1582intptr_t ChoiceNode::EatsAtLeast(intptr_t still_to_find,
1583 intptr_t budget,
1584 bool not_at_start) {
1585 return EatsAtLeastHelper(still_to_find, budget, nullptr, not_at_start);
1586}
1587
1588// Takes the left-most 1-bit and smears it out, setting all bits to its right.
1589static inline uint32_t SmearBitsRight(uint32_t v) {
1590 v |= v >> 1;
1591 v |= v >> 2;
1592 v |= v >> 4;
1593 v |= v >> 8;
1594 v |= v >> 16;
1595 return v;
1596}
1597
1599 bool found_useful_op = false;
1600 uint32_t char_mask;
1601 if (asc) {
1603 } else {
1604 char_mask = Utf16::kMaxCodeUnit;
1605 }
1606 mask_ = 0;
1607 value_ = 0;
1608 intptr_t char_shift = 0;
1609 for (intptr_t i = 0; i < characters_; i++) {
1610 Position* pos = &positions_[i];
1611 if ((pos->mask & Symbols::kMaxOneCharCodeSymbol) != 0) {
1612 found_useful_op = true;
1613 }
1614 mask_ |= (pos->mask & char_mask) << char_shift;
1615 value_ |= (pos->value & char_mask) << char_shift;
1616 char_shift += asc ? 8 : 16;
1617 }
1618 return found_useful_op;
1619}
1620
1622 Trace* bounds_check_trace,
1623 Trace* trace,
1624 bool preload_has_checked_bounds,
1625 BlockLabel* on_possible_success,
1626 QuickCheckDetails* details,
1627 bool fall_through_on_failure) {
1628 if (details->characters() == 0) return false;
1629 GetQuickCheckDetails(details, compiler, 0,
1630 trace->at_start() == Trace::FALSE_VALUE);
1631 if (details->cannot_match()) return false;
1632 if (!details->Rationalize(compiler->one_byte())) return false;
1633 ASSERT(details->characters() == 1 ||
1634 compiler->macro_assembler()->CanReadUnaligned());
1635 uint32_t mask = details->mask();
1636 uint32_t value = details->value();
1637
1638 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1639
1640 if (trace->characters_preloaded() != details->characters()) {
1641 ASSERT(trace->cp_offset() == bounds_check_trace->cp_offset());
1642 // We are attempting to preload the minimum number of characters
1643 // any choice would eat, so if the bounds check fails, then none of the
1644 // choices can succeed, so we can just immediately backtrack, rather
1645 // than go to the next choice.
1646 assembler->LoadCurrentCharacter(
1647 trace->cp_offset(), bounds_check_trace->backtrack(),
1648 !preload_has_checked_bounds, details->characters());
1649 }
1650
1651 bool need_mask = true;
1652
1653 if (details->characters() == 1) {
1654 // If number of characters preloaded is 1 then we used a byte or 16 bit
1655 // load so the value is already masked down.
1656 uint32_t char_mask;
1657 if (compiler->one_byte()) {
1659 } else {
1660 char_mask = Utf16::kMaxCodeUnit;
1661 }
1662 if ((mask & char_mask) == char_mask) need_mask = false;
1663 mask &= char_mask;
1664 } else {
1665 // For 2-character preloads in one-byte mode or 1-character preloads in
1666 // two-byte mode we also use a 16 bit load with zero extend.
1667 if (details->characters() == 2 && compiler->one_byte()) {
1668 if ((mask & 0xffff) == 0xffff) need_mask = false;
1669 } else if (details->characters() == 1 && !compiler->one_byte()) {
1670 if ((mask & 0xffff) == 0xffff) need_mask = false;
1671 } else {
1672 if (mask == 0xffffffff) need_mask = false;
1673 }
1674 }
1675
1676 if (fall_through_on_failure) {
1677 if (need_mask) {
1678 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
1679 } else {
1680 assembler->CheckCharacter(value, on_possible_success);
1681 }
1682 } else {
1683 if (need_mask) {
1684 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
1685 } else {
1686 assembler->CheckNotCharacter(value, trace->backtrack());
1687 }
1688 }
1689 return true;
1690}
1691
1692// Here is the meat of GetQuickCheckDetails (see also the comment on the
1693// super-class in the .h file).
1694//
1695// We iterate along the text object, building up for each character a
1696// mask and value that can be used to test for a quick failure to match.
1697// The masks and values for the positions will be combined into a single
1698// machine word for the current character width in order to be used in
1699// generating a quick check.
1702 intptr_t characters_filled_in,
1703 bool not_at_start) {
1704#if defined(__GNUC__) && defined(__BYTE_ORDER__)
1705 // TODO(zerny): Make the combination code byte-order independent.
1706 ASSERT(details->characters() == 1 ||
1707 (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__));
1708#endif
1709 // Do not collect any quick check details if the text node reads backward,
1710 // since it reads in the opposite direction than we use for quick checks.
1711 if (read_backward()) return;
1712 ASSERT(characters_filled_in < details->characters());
1713 intptr_t characters = details->characters();
1714 int32_t char_mask;
1715 if (compiler->one_byte()) {
1717 } else {
1718 char_mask = Utf16::kMaxCodeUnit;
1719 }
1720 for (intptr_t k = 0; k < elms_->length(); k++) {
1721 TextElement elm = elms_->At(k);
1722 if (elm.text_type() == TextElement::ATOM) {
1723 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
1724 for (intptr_t i = 0; i < characters && i < quarks->length(); i++) {
1726 details->positions(characters_filled_in);
1727 uint16_t c = quarks->At(i);
1728 if (c > char_mask) {
1729 // If we expect a non-Latin1 character from an one-byte string,
1730 // there is no way we can match. Not even case independent
1731 // matching can turn an Latin1 character into non-Latin1 or
1732 // vice versa.
1733 // TODO(dcarney): issue 3550. Verify that this works as expected.
1734 // For example, \u0178 is uppercase of \u00ff (y-umlaut).
1735 details->set_cannot_match();
1736 pos->determines_perfectly = false;
1737 return;
1738 }
1739 if (elm.atom()->ignore_case()) {
1741 intptr_t length =
1742 GetCaseIndependentLetters(c, compiler->one_byte(), chars);
1743 ASSERT(length != 0); // Can only happen if c > char_mask (see above).
1744 if (length == 1) {
1745 // This letter has no case equivalents, so it's nice and simple
1746 // and the mask-compare will determine definitely whether we have
1747 // a match at this character position.
1748 pos->mask = char_mask;
1749 pos->value = c;
1750 pos->determines_perfectly = true;
1751 } else {
1752 uint32_t common_bits = char_mask;
1753 uint32_t bits = chars[0];
1754 for (intptr_t j = 1; j < length; j++) {
1755 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
1756 common_bits ^= differing_bits;
1757 bits &= common_bits;
1758 }
1759 // If length is 2 and common bits has only one zero in it then
1760 // our mask and compare instruction will determine definitely
1761 // whether we have a match at this character position. Otherwise
1762 // it can only be an approximate check.
1763 uint32_t one_zero = (common_bits | ~char_mask);
1764 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
1765 pos->determines_perfectly = true;
1766 }
1767 pos->mask = common_bits;
1768 pos->value = bits;
1769 }
1770 } else {
1771 // Don't ignore case. Nice simple case where the mask-compare will
1772 // determine definitely whether we have a match at this character
1773 // position.
1774 pos->mask = char_mask;
1775 pos->value = c;
1776 pos->determines_perfectly = true;
1777 }
1778 characters_filled_in++;
1779 ASSERT(characters_filled_in <= details->characters());
1780 if (characters_filled_in == details->characters()) {
1781 return;
1782 }
1783 }
1784 } else {
1786 details->positions(characters_filled_in);
1787 RegExpCharacterClass* tree = elm.char_class();
1788 ZoneGrowableArray<CharacterRange>* ranges = tree->ranges();
1789 ASSERT(!ranges->is_empty());
1790 if (!CharacterRange::IsCanonical(ranges)) {
1792 }
1793 if (tree->is_negated()) {
1794 // A quick check uses multi-character mask and compare. There is no
1795 // useful way to incorporate a negative char class into this scheme
1796 // so we just conservatively create a mask and value that will always
1797 // succeed.
1798 pos->mask = 0;
1799 pos->value = 0;
1800 } else {
1801 intptr_t first_range = 0;
1802 while (ranges->At(first_range).from() > char_mask) {
1803 first_range++;
1804 if (first_range == ranges->length()) {
1805 details->set_cannot_match();
1806 pos->determines_perfectly = false;
1807 return;
1808 }
1809 }
1810 CharacterRange range = ranges->At(first_range);
1811 uint16_t from = range.from();
1812 uint16_t to = range.to();
1813 if (to > char_mask) {
1814 to = char_mask;
1815 }
1816 uint32_t differing_bits = (from ^ to);
1817 // A mask and compare is only perfect if the differing bits form a
1818 // number like 00011111 with one single block of trailing 1s.
1819 if ((differing_bits & (differing_bits + 1)) == 0 &&
1820 from + differing_bits == to) {
1821 pos->determines_perfectly = true;
1822 }
1823 uint32_t common_bits = ~SmearBitsRight(differing_bits);
1824 uint32_t bits = (from & common_bits);
1825 for (intptr_t i = first_range + 1; i < ranges->length(); i++) {
1826 CharacterRange range = ranges->At(i);
1827 uint16_t from = range.from();
1828 uint16_t to = range.to();
1829 if (from > char_mask) continue;
1830 if (to > char_mask) to = char_mask;
1831 // Here we are combining more ranges into the mask and compare
1832 // value. With each new range the mask becomes more sparse and
1833 // so the chances of a false positive rise. A character class
1834 // with multiple ranges is assumed never to be equivalent to a
1835 // mask and compare operation.
1836 pos->determines_perfectly = false;
1837 uint32_t new_common_bits = (from ^ to);
1838 new_common_bits = ~SmearBitsRight(new_common_bits);
1839 common_bits &= new_common_bits;
1840 bits &= new_common_bits;
1841 uint32_t differing_bits = (from & common_bits) ^ bits;
1842 common_bits ^= differing_bits;
1843 bits &= common_bits;
1844 }
1845 pos->mask = common_bits;
1846 pos->value = bits;
1847 }
1848 characters_filled_in++;
1849 ASSERT(characters_filled_in <= details->characters());
1850 if (characters_filled_in == details->characters()) {
1851 return;
1852 }
1853 }
1854 }
1855 ASSERT(characters_filled_in != details->characters());
1856 if (!details->cannot_match()) {
1857 on_success()->GetQuickCheckDetails(details, compiler, characters_filled_in,
1858 true);
1859 }
1860}
1861
1863 for (int i = 0; i < characters_; i++) {
1864 positions_[i].mask = 0;
1865 positions_[i].value = 0;
1866 positions_[i].determines_perfectly = false;
1867 }
1868 characters_ = 0;
1869}
1870
1871void QuickCheckDetails::Advance(intptr_t by, bool one_byte) {
1872 if (by >= characters_ || by < 0) {
1873 // check that by < 0 => characters_ == 0
1874 ASSERT(by >= 0 || characters_ == 0);
1875 Clear();
1876 return;
1877 }
1878 for (intptr_t i = 0; i < characters_ - by; i++) {
1879 positions_[i] = positions_[by + i];
1880 }
1881 for (intptr_t i = characters_ - by; i < characters_; i++) {
1882 positions_[i].mask = 0;
1883 positions_[i].value = 0;
1884 positions_[i].determines_perfectly = false;
1885 }
1886 characters_ -= by;
1887 // We could change mask_ and value_ here but we would never advance unless
1888 // they had already been used in a check and they won't be used again because
1889 // it would gain us nothing. So there's no point.
1890}
1891
1892void QuickCheckDetails::Merge(QuickCheckDetails* other, intptr_t from_index) {
1893 ASSERT(characters_ == other->characters_);
1894 if (other->cannot_match_) {
1895 return;
1896 }
1897 if (cannot_match_) {
1898 *this = *other;
1899 return;
1900 }
1901 for (intptr_t i = from_index; i < characters_; i++) {
1902 QuickCheckDetails::Position* pos = positions(i);
1903 QuickCheckDetails::Position* other_pos = other->positions(i);
1904 if (pos->mask != other_pos->mask || pos->value != other_pos->value ||
1905 !other_pos->determines_perfectly) {
1906 // Our mask-compare operation will be approximate unless we have the
1907 // exact same operation on both sides of the alternation.
1908 pos->determines_perfectly = false;
1909 }
1910 pos->mask &= other_pos->mask;
1911 pos->value &= pos->mask;
1912 other_pos->value &= pos->mask;
1913 uint16_t differing_bits = (pos->value ^ other_pos->value);
1914 pos->mask &= ~differing_bits;
1915 pos->value &= pos->mask;
1916 }
1917}
1918
1919class VisitMarker : public ValueObject {
1920 public:
1921 explicit VisitMarker(NodeInfo* info) : info_(info) {
1922 ASSERT(!info->visited);
1923 info->visited = true;
1924 }
1925 ~VisitMarker() { info_->visited = false; }
1926
1927 private:
1928 NodeInfo* info_;
1929};
1930
1932 if (info()->replacement_calculated) return replacement();
1933 if (depth < 0) return this;
1934 ASSERT(!info()->visited);
1936 return FilterSuccessor(depth - 1);
1937}
1938
1940 RegExpNode* next = on_success_->FilterOneByte(depth - 1);
1941 if (next == nullptr) return set_replacement(nullptr);
1942 on_success_ = next;
1943 return set_replacement(this);
1944}
1945
1946// We need to check for the following characters: 0x39c 0x3bc 0x178.
1948 // TODO(dcarney): this could be a lot more efficient.
1949 return range.Contains(0x39c) || range.Contains(0x3bc) ||
1950 range.Contains(0x178);
1951}
1952
1955 for (intptr_t i = 0; i < ranges->length(); i++) {
1956 // TODO(dcarney): this could be a lot more efficient.
1957 if (RangeContainsLatin1Equivalents(ranges->At(i))) return true;
1958 }
1959 return false;
1960}
1961
1962static uint16_t ConvertNonLatin1ToLatin1(uint16_t c) {
1964 switch (c) {
1965 // This are equivalent characters in unicode.
1966 case 0x39c:
1967 case 0x3bc:
1968 return 0xb5;
1969 // This is an uppercase of a Latin-1 character
1970 // outside of Latin-1.
1971 case 0x178:
1972 return 0xff;
1973 }
1974 return 0;
1975}
1976
1978 if (info()->replacement_calculated) return replacement();
1979 if (depth < 0) return this;
1980 ASSERT(!info()->visited);
1982 intptr_t element_count = elms_->length();
1983 for (intptr_t i = 0; i < element_count; i++) {
1984 TextElement elm = elms_->At(i);
1985 if (elm.text_type() == TextElement::ATOM) {
1986 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
1987 for (intptr_t j = 0; j < quarks->length(); j++) {
1988 uint16_t c = quarks->At(j);
1989 if (c <= Symbols::kMaxOneCharCodeSymbol) continue;
1990 if (!elm.atom()->ignore_case()) return set_replacement(nullptr);
1991 // Here, we need to check for characters whose upper and lower cases
1992 // are outside the Latin-1 range.
1993 uint16_t converted = ConvertNonLatin1ToLatin1(c);
1994 // Character is outside Latin-1 completely
1995 if (converted == 0) return set_replacement(nullptr);
1996 // Convert quark to Latin-1 in place.
1997 (*quarks)[0] = converted;
1998 }
1999 } else {
2002 ZoneGrowableArray<CharacterRange>* ranges = cc->ranges();
2003 if (!CharacterRange::IsCanonical(ranges)) {
2005 }
2006 // Now they are in order so we only need to look at the first.
2007 intptr_t range_count = ranges->length();
2008 if (cc->is_negated()) {
2009 if (range_count != 0 && ranges->At(0).from() == 0 &&
2010 ranges->At(0).to() >= Symbols::kMaxOneCharCodeSymbol) {
2011 // This will be handled in a later filter.
2012 if (cc->flags().IgnoreCase() &&
2014 continue;
2015 }
2016 return set_replacement(nullptr);
2017 }
2018 } else {
2019 if (range_count == 0 ||
2020 ranges->At(0).from() > Symbols::kMaxOneCharCodeSymbol) {
2021 // This will be handled in a later filter.
2022 if (cc->flags().IgnoreCase() &&
2024 continue;
2025 return set_replacement(nullptr);
2026 }
2027 }
2028 }
2029 }
2030 return FilterSuccessor(depth - 1);
2031}
2032
2034 if (info()->replacement_calculated) return replacement();
2035 if (depth < 0) return this;
2036 if (info()->visited) return this;
2037 {
2039
2040 RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1);
2041 // If we can't continue after the loop then there is no sense in doing the
2042 // loop.
2043 if (continue_replacement == nullptr) return set_replacement(nullptr);
2044 }
2045
2046 return ChoiceNode::FilterOneByte(depth - 1);
2047}
2048
2050 if (info()->replacement_calculated) return replacement();
2051 if (depth < 0) return this;
2052 if (info()->visited) return this;
2054 intptr_t choice_count = alternatives_->length();
2055
2056 for (intptr_t i = 0; i < choice_count; i++) {
2057 GuardedAlternative alternative = alternatives_->At(i);
2058 if (alternative.guards() != nullptr &&
2059 alternative.guards()->length() != 0) {
2060 set_replacement(this);
2061 return this;
2062 }
2063 }
2064
2065 intptr_t surviving = 0;
2066 RegExpNode* survivor = nullptr;
2067 for (intptr_t i = 0; i < choice_count; i++) {
2068 GuardedAlternative alternative = alternatives_->At(i);
2069 RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1);
2070 ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.
2071 if (replacement != nullptr) {
2072 (*alternatives_)[i].set_node(replacement);
2073 surviving++;
2074 survivor = replacement;
2075 }
2076 }
2077 if (surviving < 2) return set_replacement(survivor);
2078
2079 set_replacement(this);
2080 if (surviving == choice_count) {
2081 return this;
2082 }
2083 // Only some of the nodes survived the filtering. We need to rebuild the
2084 // alternatives list.
2085 ZoneGrowableArray<GuardedAlternative>* new_alternatives =
2087 for (intptr_t i = 0; i < choice_count; i++) {
2088 RegExpNode* replacement =
2089 (*alternatives_)[i].node()->FilterOneByte(depth - 1);
2090 if (replacement != nullptr) {
2091 (*alternatives_)[i].set_node(replacement);
2092 new_alternatives->Add((*alternatives_)[i]);
2093 }
2094 }
2095 alternatives_ = new_alternatives;
2096 return this;
2097}
2098
2100 if (info()->replacement_calculated) return replacement();
2101 if (depth < 0) return this;
2102 if (info()->visited) return this;
2104 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2105 // afterwards.
2106 RegExpNode* node = (*alternatives_)[1].node();
2107 RegExpNode* replacement = node->FilterOneByte(depth - 1);
2108 if (replacement == nullptr) return set_replacement(nullptr);
2109 (*alternatives_)[1].set_node(replacement);
2110
2111 RegExpNode* neg_node = (*alternatives_)[0].node();
2112 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1);
2113 // If the negative lookahead is always going to fail then
2114 // we don't need to check it.
2115 if (neg_replacement == nullptr) return set_replacement(replacement);
2116 (*alternatives_)[0].set_node(neg_replacement);
2117 return set_replacement(this);
2118}
2119
2122 intptr_t characters_filled_in,
2123 bool not_at_start) {
2124 if (body_can_be_zero_length_ || info()->visited) return;
2127 characters_filled_in, not_at_start);
2128}
2129
2131 intptr_t budget,
2133 bool not_at_start) {
2134 if (body_can_be_zero_length_ || budget <= 0) {
2135 bm->SetRest(offset);
2136 SaveBMInfo(bm, not_at_start, offset);
2137 return;
2138 }
2139 ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start);
2140 SaveBMInfo(bm, not_at_start, offset);
2141}
2142
2145 intptr_t characters_filled_in,
2146 bool not_at_start) {
2147 not_at_start = (not_at_start || not_at_start_);
2148 intptr_t choice_count = alternatives_->length();
2149 ASSERT(choice_count > 0);
2150 (*alternatives_)[0].node()->GetQuickCheckDetails(
2151 details, compiler, characters_filled_in, not_at_start);
2152 for (intptr_t i = 1; i < choice_count; i++) {
2153 QuickCheckDetails new_details(details->characters());
2154 RegExpNode* node = (*alternatives_)[i].node();
2155 node->GetQuickCheckDetails(&new_details, compiler, characters_filled_in,
2156 not_at_start);
2157 // Here we merge the quick match details of the two branches.
2158 details->Merge(&new_details, characters_filled_in);
2159 }
2160}
2161
2162// Check for [0-9A-Z_a-z].
2165 BlockLabel* non_word,
2166 bool fall_through_on_word) {
2167 if (assembler->CheckSpecialCharacterClass(
2168 fall_through_on_word ? 'w' : 'W',
2169 fall_through_on_word ? non_word : word)) {
2170 // Optimized implementation available.
2171 return;
2172 }
2173 assembler->CheckCharacterGT('z', non_word);
2174 assembler->CheckCharacterLT('0', non_word);
2175 assembler->CheckCharacterGT('a' - 1, word);
2176 assembler->CheckCharacterLT('9' + 1, word);
2177 assembler->CheckCharacterLT('A', non_word);
2178 assembler->CheckCharacterLT('Z' + 1, word);
2179 if (fall_through_on_word) {
2180 assembler->CheckNotCharacter('_', non_word);
2181 } else {
2182 assembler->CheckCharacter('_', word);
2183 }
2184}
2185
2186// Emit the code to check for a ^ in multiline mode (1-character lookbehind
2187// that matches newline or the start of input).
2189 RegExpNode* on_success,
2190 Trace* trace) {
2191 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2192 // We will be loading the previous character into the current character
2193 // register.
2194 Trace new_trace(*trace);
2195 new_trace.InvalidateCurrentCharacter();
2196
2197 BlockLabel ok;
2198 if (new_trace.cp_offset() == 0) {
2199 // The start of input counts as a newline in this context, so skip to
2200 // ok if we are at the start.
2201 assembler->CheckAtStart(&ok);
2202 }
2203 // We already checked that we are not at the start of input so it must be
2204 // OK to load the previous character.
2205 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2206 new_trace.backtrack(), false);
2207 if (!assembler->CheckSpecialCharacterClass('n', new_trace.backtrack())) {
2208 // Newline means \n, \r, 0x2028 or 0x2029.
2209 if (!compiler->one_byte()) {
2210 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
2211 }
2212 assembler->CheckCharacter('\n', &ok);
2213 assembler->CheckNotCharacter('\r', new_trace.backtrack());
2214 }
2215 assembler->BindBlock(&ok);
2216 on_success->Emit(compiler, &new_trace);
2217}
2218
2219// Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2220void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
2221 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2222 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2223 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
2224 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2225 if (lookahead == nullptr) {
2226 intptr_t eats_at_least =
2228 EatsAtLeast(kMaxLookaheadForBoyerMoore, kRecursionBudget,
2229 not_at_start));
2230 if (eats_at_least >= 1) {
2231 BoyerMooreLookahead* bm =
2232 new (Z) BoyerMooreLookahead(eats_at_least, compiler, Z);
2233 FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
2234 if (bm->at(0)->is_non_word()) next_is_word_character = Trace::FALSE_VALUE;
2235 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2236 }
2237 } else {
2238 if (lookahead->at(0)->is_non_word())
2239 next_is_word_character = Trace::FALSE_VALUE;
2240 if (lookahead->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2241 }
2242 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
2243 if (next_is_word_character == Trace::UNKNOWN) {
2244 BlockLabel before_non_word;
2245 BlockLabel before_word;
2246 if (trace->characters_preloaded() != 1) {
2247 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2248 }
2249 // Fall through on non-word.
2250 EmitWordCheck(assembler, &before_word, &before_non_word, false);
2251 // Next character is not a word character.
2252 assembler->BindBlock(&before_non_word);
2253 BlockLabel ok;
2254 // Backtrack on \B (non-boundary check) if previous is a word,
2255 // since we know next *is not* a word and this would be a boundary.
2256 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2257
2258 if (!assembler->IsClosed()) {
2259 assembler->GoTo(&ok);
2260 }
2261
2262 assembler->BindBlock(&before_word);
2263 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2264 assembler->BindBlock(&ok);
2265 } else if (next_is_word_character == Trace::TRUE_VALUE) {
2266 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2267 } else {
2268 ASSERT(next_is_word_character == Trace::FALSE_VALUE);
2269 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2270 }
2271}
2272
2273void AssertionNode::BacktrackIfPrevious(
2274 RegExpCompiler* compiler,
2275 Trace* trace,
2276 AssertionNode::IfPrevious backtrack_if_previous) {
2277 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2278 Trace new_trace(*trace);
2279 new_trace.InvalidateCurrentCharacter();
2280
2281 BlockLabel fall_through, dummy;
2282
2283 BlockLabel* non_word = backtrack_if_previous == kIsNonWord
2284 ? new_trace.backtrack()
2285 : &fall_through;
2286 BlockLabel* word = backtrack_if_previous == kIsNonWord
2287 ? &fall_through
2288 : new_trace.backtrack();
2289
2290 if (new_trace.cp_offset() == 0) {
2291 // The start of input counts as a non-word character, so the question is
2292 // decided if we are at the start.
2293 assembler->CheckAtStart(non_word);
2294 }
2295 // We already checked that we are not at the start of input so it must be
2296 // OK to load the previous character.
2297 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
2298 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
2299
2300 assembler->BindBlock(&fall_through);
2301 on_success()->Emit(compiler, &new_trace);
2302}
2303
2306 intptr_t filled_in,
2307 bool not_at_start) {
2308 if (assertion_type_ == AT_START && not_at_start) {
2309 details->set_cannot_match();
2310 return;
2311 }
2312 return on_success()->GetQuickCheckDetails(details, compiler, filled_in,
2313 not_at_start);
2314}
2315
2317 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2318 switch (assertion_type_) {
2319 case AT_END: {
2320 BlockLabel ok;
2321 assembler->CheckPosition(trace->cp_offset(), &ok);
2322 assembler->GoTo(trace->backtrack());
2323 assembler->BindBlock(&ok);
2324 break;
2325 }
2326 case AT_START: {
2327 if (trace->at_start() == Trace::FALSE_VALUE) {
2328 assembler->GoTo(trace->backtrack());
2329 return;
2330 }
2331 if (trace->at_start() == Trace::UNKNOWN) {
2332 assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
2333 Trace at_start_trace = *trace;
2334 at_start_trace.set_at_start(Trace::TRUE_VALUE);
2335 on_success()->Emit(compiler, &at_start_trace);
2336 return;
2337 }
2338 } break;
2339 case AFTER_NEWLINE:
2340 EmitHat(compiler, on_success(), trace);
2341 return;
2342 case AT_BOUNDARY:
2343 case AT_NON_BOUNDARY: {
2344 EmitBoundaryCheck(compiler, trace);
2345 return;
2346 }
2347 }
2348 on_success()->Emit(compiler, trace);
2349}
2350
2351static bool DeterminedAlready(QuickCheckDetails* quick_check, intptr_t offset) {
2352 if (quick_check == nullptr) return false;
2353 if (offset >= quick_check->characters()) return false;
2354 return quick_check->positions(offset)->determines_perfectly;
2355}
2356
2357static void UpdateBoundsCheck(intptr_t index, intptr_t* checked_up_to) {
2358 if (index > *checked_up_to) {
2359 *checked_up_to = index;
2360 }
2361}
2362
2363// We call this repeatedly to generate code for each pass over the text node.
2364// The passes are in increasing order of difficulty because we hope one
2365// of the first passes will fail in which case we are saved the work of the
2366// later passes. for example for the case independent regexp /%[asdfghjkl]a/
2367// we will check the '%' in the first pass, the case independent 'a' in the
2368// second pass and the character class in the last pass.
2369//
2370// The passes are done from right to left, so for example to test for /bar/
2371// we will first test for an 'r' with offset 2, then an 'a' with offset 1
2372// and then a 'b' with offset 0. This means we can avoid the end-of-input
2373// bounds check most of the time. In the example we only need to check for
2374// end-of-input when loading the putative 'r'.
2375//
2376// A slight complication involves the fact that the first character may already
2377// be fetched into a register by the previous node. In this case we want to
2378// do the test for that character first. We do this in separate passes. The
2379// 'preloaded' argument indicates that we are doing such a 'pass'. If such a
2380// pass has been performed then subsequent passes will have true in
2381// first_element_checked to indicate that character does not need to be
2382// checked again.
2383//
2384// In addition to all this we are passed a Trace, which can
2385// contain an AlternativeGeneration object. In this AlternativeGeneration
2386// object we can see details of any quick check that was already passed in
2387// order to get to the code we are now generating. The quick check can involve
2388// loading characters, which means we do not need to recheck the bounds
2389// up to the limit the quick check already checked. In addition the quick
2390// check can have involved a mask and compare operation which may simplify
2391// or obviate the need for further checks at some character positions.
2392void TextNode::TextEmitPass(RegExpCompiler* compiler,
2393 TextEmitPassType pass,
2394 bool preloaded,
2395 Trace* trace,
2396 bool first_element_checked,
2397 intptr_t* checked_up_to) {
2398 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2399 bool one_byte = compiler->one_byte();
2400 BlockLabel* backtrack = trace->backtrack();
2401 QuickCheckDetails* quick_check = trace->quick_check_performed();
2402 intptr_t element_count = elms_->length();
2403 intptr_t backward_offset = read_backward() ? -Length() : 0;
2404 for (intptr_t i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2405 TextElement elm = elms_->At(i);
2406 intptr_t cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
2407 if (elm.text_type() == TextElement::ATOM) {
2408 ZoneGrowableArray<uint16_t>* quarks = elm.atom()->data();
2409 for (intptr_t j = preloaded ? 0 : quarks->length() - 1; j >= 0; j--) {
2410 if (SkipPass(pass, elm.atom()->ignore_case())) continue;
2411 if (first_element_checked && i == 0 && j == 0) continue;
2412 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
2413 EmitCharacterFunction* emit_function = nullptr;
2414 uint16_t quark = quarks->At(j);
2415 if (elm.atom()->ignore_case()) {
2416 // Everywhere else we assume that a non-Latin-1 character cannot match
2417 // a Latin-1 character. Avoid the cases where this is assumption is
2418 // invalid by using the Latin1 equivalent instead.
2419 quark = Latin1::TryConvertToLatin1(quark);
2420 }
2421 switch (pass) {
2422 case NON_LATIN1_MATCH:
2423 ASSERT(one_byte);
2424 if (quark > Symbols::kMaxOneCharCodeSymbol) {
2425 assembler->GoTo(backtrack);
2426 return;
2427 }
2428 break;
2429 case NON_LETTER_CHARACTER_MATCH:
2430 emit_function = &EmitAtomNonLetter;
2431 break;
2432 case SIMPLE_CHARACTER_MATCH:
2433 emit_function = &EmitSimpleCharacter;
2434 break;
2435 case CASE_CHARACTER_MATCH:
2436 emit_function = &EmitAtomLetter;
2437 break;
2438 default:
2439 break;
2440 }
2441 if (emit_function != nullptr) {
2442 const bool bounds_check =
2443 *checked_up_to < (cp_offset + j) || read_backward();
2444 bool bound_checked =
2445 emit_function(Z, compiler, quarks->At(j), backtrack,
2446 cp_offset + j, bounds_check, preloaded);
2447 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
2448 }
2449 }
2450 } else {
2451 ASSERT(elm.text_type() == TextElement::CHAR_CLASS);
2452 if (pass == CHARACTER_CLASS_MATCH) {
2453 if (first_element_checked && i == 0) continue;
2454 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
2455 RegExpCharacterClass* cc = elm.char_class();
2456 bool bounds_check = *checked_up_to < cp_offset || read_backward();
2457 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
2458 bounds_check, preloaded, Z);
2459 UpdateBoundsCheck(cp_offset, checked_up_to);
2460 }
2461 }
2462 }
2463}
2464
2465intptr_t TextNode::Length() {
2466 TextElement elm = elms_->Last();
2467 ASSERT(elm.cp_offset() >= 0);
2468 return elm.cp_offset() + elm.length();
2469}
2470
2471bool TextNode::SkipPass(intptr_t intptr_t_pass, bool ignore_case) {
2472 TextEmitPassType pass = static_cast<TextEmitPassType>(intptr_t_pass);
2473 if (ignore_case) {
2474 return pass == SIMPLE_CHARACTER_MATCH;
2475 } else {
2476 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
2477 }
2478}
2479
2482 bool read_backward,
2483 RegExpNode* on_success,
2485 ASSERT(ranges != nullptr);
2488 return new TextNode(elms, read_backward, on_success);
2489}
2490
2492 CharacterRange trail,
2493 bool read_backward,
2494 RegExpNode* on_success,
2496 auto lead_ranges = CharacterRange::List(on_success->zone(), lead);
2497 auto trail_ranges = CharacterRange::List(on_success->zone(), trail);
2498 auto elms = new ZoneGrowableArray<TextElement>(2);
2499
2500 elms->Add(
2502 elms->Add(
2504
2505 return new TextNode(elms, read_backward, on_success);
2506}
2507
2508// This generates the code to match a text node. A text node can contain
2509// straight character sequences (possibly to be matched in a case-independent
2510// way) and character classes. For efficiency we do not do this in a single
2511// pass from left to right. Instead we pass over the text node several times,
2512// emitting code for some character positions every time. See the comment on
2513// TextEmitPass for details.
2515 LimitResult limit_result = LimitVersions(compiler, trace);
2516 if (limit_result == DONE) return;
2517 ASSERT(limit_result == CONTINUE);
2518
2519 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2520 compiler->SetRegExpTooBig();
2521 return;
2522 }
2523
2524 if (compiler->one_byte()) {
2525 intptr_t dummy = 0;
2526 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
2527 }
2528
2529 bool first_elt_done = false;
2530 intptr_t bound_checked_to = trace->cp_offset() - 1;
2531 bound_checked_to += trace->bound_checked_up_to();
2532
2533 // If a character is preloaded into the current character register then
2534 // check that now.
2535 if (trace->characters_preloaded() == 1) {
2536 for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) {
2537 TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), true, trace,
2538 false, &bound_checked_to);
2539 }
2540 first_elt_done = true;
2541 }
2542
2543 for (intptr_t pass = kFirstRealPass; pass <= kLastPass; pass++) {
2544 TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), false, trace,
2545 first_elt_done, &bound_checked_to);
2546 }
2547
2548 Trace successor_trace(*trace);
2549 // If we advance backward, we may end up at the start.
2550 successor_trace.AdvanceCurrentPositionInTrace(
2551 read_backward() ? -Length() : Length(), compiler);
2552 successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
2555 on_success()->Emit(compiler, &successor_trace);
2556}
2557
2559 characters_preloaded_ = 0;
2560}
2561
2564 // We don't have an instruction for shifting the current character register
2565 // down or for using a shifted value for anything so lets just forget that
2566 // we preloaded any characters into it.
2567 characters_preloaded_ = 0;
2568 // Adjust the offsets of the quick check performed information. This
2569 // information is used to find out what we already determined about the
2570 // characters by means of mask and compare.
2571 quick_check_performed_.Advance(by, compiler->one_byte());
2572 cp_offset_ += by;
2573 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2574 compiler->SetRegExpTooBig();
2575 cp_offset_ = 0;
2576 }
2577 bound_checked_up_to_ =
2578 Utils::Maximum(static_cast<intptr_t>(0), bound_checked_up_to_ - by);
2579}
2580
2581void TextNode::MakeCaseIndependent(bool is_one_byte) {
2582 intptr_t element_count = elms_->length();
2583 for (intptr_t i = 0; i < element_count; i++) {
2584 TextElement elm = elms_->At(i);
2585 if (elm.text_type() == TextElement::CHAR_CLASS) {
2587 bool case_equivalents_already_added =
2588 cc->flags().NeedsUnicodeCaseEquivalents();
2589 if (cc->flags().IgnoreCase() && !case_equivalents_already_added) {
2590 // None of the standard character classes is different in the case
2591 // independent case and it slows us down if we don't know that.
2592 if (cc->is_standard()) continue;
2593 CharacterRange::AddCaseEquivalents(cc->ranges(), is_one_byte, Z);
2594 }
2595 }
2596 }
2597}
2598
2600 TextElement elm = elms_->At(elms_->length() - 1);
2601 return elm.cp_offset() + elm.length();
2602}
2603
2606 if (read_backward()) return nullptr;
2607 if (elms_->length() != 1) return nullptr;
2608 TextElement elm = elms_->At(0);
2609 if (elm.text_type() != TextElement::CHAR_CLASS) return nullptr;
2610 RegExpCharacterClass* node = elm.char_class();
2611 ZoneGrowableArray<CharacterRange>* ranges = node->ranges();
2612 if (!CharacterRange::IsCanonical(ranges)) {
2614 }
2615 if (node->is_negated()) {
2616 return ranges->length() == 0 ? on_success() : nullptr;
2617 }
2618 if (ranges->length() != 1) return nullptr;
2619 uint32_t max_char;
2620 if (compiler->one_byte()) {
2622 } else {
2623 max_char = Utf16::kMaxCodeUnit;
2624 }
2625 return ranges->At(0).IsEverything(max_char) ? on_success() : nullptr;
2626}
2627
2628// Finds the fixed match length of a sequence of nodes that goes from
2629// this alternative and back to this choice node. If there are variable
2630// length nodes or other complications in the way then return a sentinel
2631// value indicating that a greedy loop cannot be constructed.
2633 const GuardedAlternative* alternative) {
2634 intptr_t length = 0;
2635 RegExpNode* node = alternative->node();
2636 // Later we will generate code for all these text nodes using recursion
2637 // so we have to limit the max number.
2638 intptr_t recursion_depth = 0;
2639 while (node != this) {
2640 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2641 return kNodeIsTooComplexForGreedyLoops;
2642 }
2643 intptr_t node_length = node->GreedyLoopTextLength();
2644 if (node_length == kNodeIsTooComplexForGreedyLoops) {
2645 return kNodeIsTooComplexForGreedyLoops;
2646 }
2647 length += node_length;
2648 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2649 node = seq_node->on_success();
2650 }
2651 return read_backward() ? -length : length;
2652}
2653
2655 ASSERT(loop_node_ == nullptr);
2656 AddAlternative(alt);
2657 loop_node_ = alt.node();
2658}
2659
2661 ASSERT(continue_node_ == nullptr);
2662 AddAlternative(alt);
2663 continue_node_ = alt.node();
2664}
2665
2667 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2668 if (trace->stop_node() == this) {
2669 // Back edge of greedy optimized loop node graph.
2670 intptr_t text_length =
2671 GreedyLoopTextLengthForAlternative(&alternatives_->At(0));
2672 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
2673 // Update the counter-based backtracking info on the stack. This is an
2674 // optimization for greedy loops (see below).
2675 ASSERT(trace->cp_offset() == text_length);
2676 macro_assembler->AdvanceCurrentPosition(text_length);
2677 macro_assembler->GoTo(trace->loop_label());
2678 return;
2679 }
2680 ASSERT(trace->stop_node() == nullptr);
2681 if (!trace->is_trivial()) {
2682 trace->Flush(compiler, this);
2683 return;
2684 }
2685 ChoiceNode::Emit(compiler, trace);
2686}
2687
2688intptr_t ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
2689 intptr_t eats_at_least) {
2690 intptr_t preload_characters =
2691 Utils::Minimum(static_cast<intptr_t>(4), eats_at_least);
2692 if (compiler->one_byte()) {
2693#if !defined(DART_COMPRESSED_POINTERS) && !defined(TARGET_ARCH_RISCV32)
2694 if (preload_characters > 4) preload_characters = 4;
2695 // We can't preload 3 characters because there is no machine instruction
2696 // to do that. We can't just load 4 because we could be reading
2697 // beyond the end of the string, which could cause a memory fault.
2698 if (preload_characters == 3) preload_characters = 2;
2699#else
2700 // Ensure LoadCodeUnitsInstr can always produce a Smi. See
2701 // https://github.com/dart-lang/sdk/issues/29951
2702 if (preload_characters > 2) preload_characters = 2;
2703#endif
2704 } else {
2705#if !defined(DART_COMPRESSED_POINTERS) && !defined(TARGET_ARCH_RISCV32)
2706 if (preload_characters > 2) preload_characters = 2;
2707#else
2708 // Ensure LoadCodeUnitsInstr can always produce a Smi. See
2709 // https://github.com/dart-lang/sdk/issues/29951
2710 if (preload_characters > 1) preload_characters = 1;
2711#endif
2712 }
2713 if (!compiler->macro_assembler()->CanReadUnaligned()) {
2714 if (preload_characters > 1) preload_characters = 1;
2715 }
2716 return preload_characters;
2717}
2718
2719// This structure is used when generating the alternatives in a choice node. It
2720// records the way the alternative is being code generated.
2723 : possible_success(),
2724 expects_preload(false),
2725 after(),
2726 quick_check_details() {}
2731};
2732
2733// Creates a list of AlternativeGenerations. If the list has a reasonable
2734// size then it is on the stack, otherwise the excess is on the heap.
2736 public:
2737 explicit AlternativeGenerationList(intptr_t count) : count_(count) {
2738 ASSERT(count >= 0);
2739 if (count > kAFew) {
2740 excess_alt_gens_.reset(new AlternativeGeneration[count - kAFew]);
2741 }
2742 }
2743
2745 ASSERT(0 <= i);
2746 ASSERT(i < count_);
2747 if (i < kAFew) {
2748 return &a_few_alt_gens_[i];
2749 }
2750 return &excess_alt_gens_[i - kAFew];
2751 }
2752
2753 private:
2754 static constexpr intptr_t kAFew = 10;
2755
2756 intptr_t count_;
2757 AlternativeGeneration a_few_alt_gens_[kAFew];
2758 std::unique_ptr<AlternativeGeneration[]> excess_alt_gens_;
2759
2762};
2763
2764static constexpr int32_t kRangeEndMarker = Utf::kMaxCodePoint + 1;
2765
2766// The '2' variant is inclusive from and exclusive to.
2767// This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
2768// which include WhiteSpace (7.2) or LineTerminator (7.3) values.
2769// 0x180E has been removed from Unicode's Zs category and thus
2770// from ECMAScript's WhiteSpace category as of Unicode 6.3.
2771static constexpr int32_t kSpaceRanges[] = {
2772 '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680,
2773 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
2774 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
2775static constexpr intptr_t kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);
2776static constexpr int32_t kWordRanges[] = {
2777 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
2778static constexpr intptr_t kWordRangeCount = ARRAY_SIZE(kWordRanges);
2779static constexpr int32_t kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
2780static constexpr intptr_t kDigitRangeCount = ARRAY_SIZE(kDigitRanges);
2781static constexpr int32_t kSurrogateRanges[] = {0xd800, 0xe000, kRangeEndMarker};
2783static constexpr int32_t kLineTerminatorRanges[] = {
2784 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
2785static constexpr intptr_t kLineTerminatorRangeCount =
2787
2789 SetInterval(Interval(character, character));
2790}
2791
2793 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
2794 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
2795 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
2796 surrogate_ =
2797 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
2798 if (interval.to() - interval.from() >= kMapSize - 1) {
2799 if (map_count_ != kMapSize) {
2800 map_count_ = kMapSize;
2801 for (intptr_t i = 0; i < kMapSize; i++)
2802 (*map_)[i] = true;
2803 }
2804 return;
2805 }
2806 for (intptr_t i = interval.from(); i <= interval.to(); i++) {
2807 intptr_t mod_character = (i & kMask);
2808 if (!map_->At(mod_character)) {
2809 map_count_++;
2810 (*map_)[mod_character] = true;
2811 }
2812 if (map_count_ == kMapSize) return;
2813 }
2814}
2815
2817 s_ = w_ = d_ = kLatticeUnknown;
2818 if (map_count_ != kMapSize) {
2819 map_count_ = kMapSize;
2820 for (intptr_t i = 0; i < kMapSize; i++)
2821 (*map_)[i] = true;
2822 }
2823}
2824
2827 Zone* zone)
2828 : length_(length), compiler_(compiler) {
2829 if (compiler->one_byte()) {
2831 } else {
2832 max_char_ = Utf16::kMaxCodeUnit;
2833 }
2835 for (intptr_t i = 0; i < length; i++) {
2836 bitmaps_->Add(new (zone) BoyerMoorePositionInfo(zone));
2837 }
2838}
2839
2840// Find the longest range of lookahead that has the fewest number of different
2841// characters that can occur at a given position. Since we are optimizing two
2842// different parameters at once this is a tradeoff.
2843bool BoyerMooreLookahead::FindWorthwhileInterval(intptr_t* from, intptr_t* to) {
2844 intptr_t biggest_points = 0;
2845 // If more than 32 characters out of 128 can occur it is unlikely that we can
2846 // be lucky enough to step forwards much of the time.
2847 const intptr_t kMaxMax = 32;
2848 for (intptr_t max_number_of_chars = 4; max_number_of_chars < kMaxMax;
2849 max_number_of_chars *= 2) {
2850 biggest_points =
2851 FindBestInterval(max_number_of_chars, biggest_points, from, to);
2852 }
2853 if (biggest_points == 0) return false;
2854 return true;
2855}
2856
2857// Find the highest-points range between 0 and length_ where the character
2858// information is not too vague. 'Too vague' means that there are more than
2859// max_number_of_chars that can occur at this position. Calculates the number
2860// of points as the product of width-of-the-range and
2861// probability-of-finding-one-of-the-characters, where the probability is
2862// calculated using the frequency distribution of the sample subject string.
2863intptr_t BoyerMooreLookahead::FindBestInterval(intptr_t max_number_of_chars,
2864 intptr_t old_biggest_points,
2865 intptr_t* from,
2866 intptr_t* to) {
2867 intptr_t biggest_points = old_biggest_points;
2868 static constexpr intptr_t kSize = RegExpMacroAssembler::kTableSize;
2869 for (intptr_t i = 0; i < length_;) {
2870 while (i < length_ && Count(i) > max_number_of_chars)
2871 i++;
2872 if (i == length_) break;
2873 intptr_t remembered_from = i;
2874 bool union_map[kSize];
2875 for (intptr_t j = 0; j < kSize; j++)
2876 union_map[j] = false;
2877 while (i < length_ && Count(i) <= max_number_of_chars) {
2878 BoyerMoorePositionInfo* map = bitmaps_->At(i);
2879 for (intptr_t j = 0; j < kSize; j++)
2880 union_map[j] |= map->at(j);
2881 i++;
2882 }
2883 intptr_t frequency = 0;
2884 for (intptr_t j = 0; j < kSize; j++) {
2885 if (union_map[j]) {
2886 // Add 1 to the frequency to give a small per-character boost for
2887 // the cases where our sampling is not good enough and many
2888 // characters have a frequency of zero. This means the frequency
2889 // can theoretically be up to 2*kSize though we treat it mostly as
2890 // a fraction of kSize.
2891 frequency += compiler_->frequency_collator()->Frequency(j) + 1;
2892 }
2893 }
2894 // We use the probability of skipping times the distance we are skipping to
2895 // judge the effectiveness of this. Actually we have a cut-off: By
2896 // dividing by 2 we switch off the skipping if the probability of skipping
2897 // is less than 50%. This is because the multibyte mask-and-compare
2898 // skipping in quickcheck is more likely to do well on this case.
2899 bool in_quickcheck_range =
2900 ((i - remembered_from < 4) ||
2901 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
2902 // Called 'probability' but it is only a rough estimate and can actually
2903 // be outside the 0-kSize range.
2904 intptr_t probability =
2905 (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
2906 intptr_t points = (i - remembered_from) * probability;
2907 if (points > biggest_points) {
2908 *from = remembered_from;
2909 *to = i - 1;
2910 biggest_points = points;
2911 }
2912 }
2913 return biggest_points;
2914}
2915
2916// Take all the characters that will not prevent a successful match if they
2917// occur in the subject string in the range between min_lookahead and
2918// max_lookahead (inclusive) measured from the current position. If the
2919// character at max_lookahead offset is not one of these characters, then we
2920// can safely skip forwards by the number of characters in the range.
2921intptr_t BoyerMooreLookahead::GetSkipTable(
2922 intptr_t min_lookahead,
2923 intptr_t max_lookahead,
2924 const TypedData& boolean_skip_table) {
2925 const intptr_t kSize = RegExpMacroAssembler::kTableSize;
2926
2927 const intptr_t kSkipArrayEntry = 0;
2928 const intptr_t kDontSkipArrayEntry = 1;
2929
2930 for (intptr_t i = 0; i < kSize; i++) {
2931 boolean_skip_table.SetUint8(i, kSkipArrayEntry);
2932 }
2933 intptr_t skip = max_lookahead + 1 - min_lookahead;
2934
2935 for (intptr_t i = max_lookahead; i >= min_lookahead; i--) {
2936 BoyerMoorePositionInfo* map = bitmaps_->At(i);
2937 for (intptr_t j = 0; j < kSize; j++) {
2938 if (map->at(j)) {
2939 boolean_skip_table.SetUint8(j, kDontSkipArrayEntry);
2940 }
2941 }
2942 }
2943
2944 return skip;
2945}
2946
2947// See comment above on the implementation of GetSkipTable.
2949 const intptr_t kSize = RegExpMacroAssembler::kTableSize;
2950
2951 intptr_t min_lookahead = 0;
2952 intptr_t max_lookahead = 0;
2953
2954 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
2955
2956 bool found_single_character = false;
2957 intptr_t single_character = 0;
2958 for (intptr_t i = max_lookahead; i >= min_lookahead; i--) {
2959 BoyerMoorePositionInfo* map = bitmaps_->At(i);
2960 if (map->map_count() > 1 ||
2961 (found_single_character && map->map_count() != 0)) {
2962 found_single_character = false;
2963 break;
2964 }
2965 for (intptr_t j = 0; j < kSize; j++) {
2966 if (map->at(j)) {
2967 found_single_character = true;
2968 single_character = j;
2969 break;
2970 }
2971 }
2972 }
2973
2974 intptr_t lookahead_width = max_lookahead + 1 - min_lookahead;
2975
2976 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
2977 // The mask-compare can probably handle this better.
2978 return;
2979 }
2980
2981 if (found_single_character) {
2982 BlockLabel cont, again;
2983 masm->BindBlock(&again);
2984 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
2985 if (max_char_ > kSize) {
2986 masm->CheckCharacterAfterAnd(single_character,
2988 } else {
2989 masm->CheckCharacter(single_character, &cont);
2990 }
2991 masm->AdvanceCurrentPosition(lookahead_width);
2992 masm->GoTo(&again);
2993 masm->BindBlock(&cont);
2994 return;
2995 }
2996
2997 const TypedData& boolean_skip_table = TypedData::ZoneHandle(
2998 compiler_->zone(),
2999 TypedData::New(kTypedDataUint8ArrayCid, kSize, Heap::kOld));
3000 intptr_t skip_distance =
3001 GetSkipTable(min_lookahead, max_lookahead, boolean_skip_table);
3002 ASSERT(skip_distance != 0);
3003
3004 BlockLabel cont, again;
3005
3006 masm->BindBlock(&again);
3007 masm->CheckPreemption(/*is_backtrack=*/false);
3008 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3009 masm->CheckBitInTable(boolean_skip_table, &cont);
3010 masm->AdvanceCurrentPosition(skip_distance);
3011 masm->GoTo(&again);
3012 masm->BindBlock(&cont);
3013
3014 return;
3015}
3016
3017/* Code generation for choice nodes.
3018 *
3019 * We generate quick checks that do a mask and compare to eliminate a
3020 * choice. If the quick check succeeds then it jumps to the continuation to
3021 * do slow checks and check subsequent nodes. If it fails (the common case)
3022 * it falls through to the next choice.
3023 *
3024 * Here is the desired flow graph. Nodes directly below each other imply
3025 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3026 * 3 doesn't have a quick check so we have to call the slow check.
3027 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3028 * regexp continuation is generated directly after the Sn node, up to the
3029 * next GoTo if we decide to reuse some already generated code. Some
3030 * nodes expect preload_characters to be preloaded into the current
3031 * character register. R nodes do this preloading. Vertices are marked
3032 * F for failures and S for success (possible success in the case of quick
3033 * nodes). L, V, < and > are used as arrow heads.
3034 *
3035 * ----------> R
3036 * |
3037 * V
3038 * Q1 -----> S1
3039 * | S /
3040 * F| /
3041 * | F/
3042 * | /
3043 * | R
3044 * | /
3045 * V L
3046 * Q2 -----> S2
3047 * | S /
3048 * F| /
3049 * | F/
3050 * | /
3051 * | R
3052 * | /
3053 * V L
3054 * S3
3055 * |
3056 * F|
3057 * |
3058 * R
3059 * |
3060 * backtrack V
3061 * <----------Q4
3062 * \ F |
3063 * \ |S
3064 * \ F V
3065 * \-----S4
3066 *
3067 * For greedy loops we push the current position, then generate the code that
3068 * eats the input specially in EmitGreedyLoop. The other choice (the
3069 * continuation) is generated by the normal code in EmitChoices, and steps back
3070 * in the input to the starting position when it fails to match. The loop code
3071 * looks like this (U is the unwind code that steps back in the greedy loop).
3072 *
3073 * _____
3074 * / \
3075 * V |
3076 * ----------> S1 |
3077 * /| |
3078 * / |S |
3079 * F/ \_____/
3080 * /
3081 * |<-----
3082 * | \
3083 * V |S
3084 * Q2 ---> U----->backtrack
3085 * | F /
3086 * S| /
3087 * V F /
3088 * S2--/
3089 */
3090
3092 counter_backtrack_trace_.set_backtrack(&label_);
3093 if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3094}
3095
3096void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3097#ifdef DEBUG
3098 intptr_t choice_count = alternatives_->length();
3099 for (intptr_t i = 0; i < choice_count - 1; i++) {
3100 GuardedAlternative alternative = alternatives_->At(i);
3101 ZoneGrowableArray<Guard*>* guards = alternative.guards();
3102 intptr_t guard_count = (guards == nullptr) ? 0 : guards->length();
3103 for (intptr_t j = 0; j < guard_count; j++) {
3104 ASSERT(!trace->mentions_reg(guards->At(j)->reg()));
3105 }
3106 }
3107#endif
3108}
3109
3110void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3111 Trace* current_trace,
3112 PreloadState* state) {
3113 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3114 // Save some time by looking at most one machine word ahead.
3115 state->eats_at_least_ =
3116 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3117 current_trace->at_start() == Trace::FALSE_VALUE);
3118 }
3119 state->preload_characters_ =
3120 CalculatePreloadCharacters(compiler, state->eats_at_least_);
3121
3122 state->preload_is_current_ =
3123 (current_trace->characters_preloaded() == state->preload_characters_);
3124 state->preload_has_checked_bounds_ = state->preload_is_current_;
3125}
3126
3128 intptr_t choice_count = alternatives_->length();
3129
3130 if (choice_count == 1 && alternatives_->At(0).guards() == nullptr) {
3131 alternatives_->At(0).node()->Emit(compiler, trace);
3132 return;
3133 }
3134
3135 AssertGuardsMentionRegisters(trace);
3136
3137 LimitResult limit_result = LimitVersions(compiler, trace);
3138 if (limit_result == DONE) return;
3139 ASSERT(limit_result == CONTINUE);
3140
3141 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3142 // other choice nodes we only flush if we are out of code size budget.
3143 if (trace->flush_budget() == 0 && trace->actions() != nullptr) {
3144 trace->Flush(compiler, this);
3145 return;
3146 }
3147
3149
3150 PreloadState preload;
3151 preload.init();
3152 GreedyLoopState greedy_loop_state(not_at_start());
3153
3154 intptr_t text_length =
3156 AlternativeGenerationList alt_gens(choice_count);
3157
3158 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3159 trace = EmitGreedyLoop(compiler, trace, &alt_gens, &preload,
3160 &greedy_loop_state, text_length);
3161 } else {
3162 // TODO(erikcorry): Delete this. We don't need this label, but it makes us
3163 // match the traces produced pre-cleanup.
3164 BlockLabel second_choice;
3165 compiler->macro_assembler()->BindBlock(&second_choice);
3166
3167 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3168
3169 EmitChoices(compiler, &alt_gens, 0, trace, &preload);
3170 }
3171
3172 // At this point we need to generate slow checks for the alternatives where
3173 // the quick check was inlined. We can recognize these because the associated
3174 // label was bound.
3175 intptr_t new_flush_budget = trace->flush_budget() / choice_count;
3176 for (intptr_t i = 0; i < choice_count; i++) {
3177 AlternativeGeneration* alt_gen = alt_gens.at(i);
3178 Trace new_trace(*trace);
3179 // If there are actions to be flushed we have to limit how many times
3180 // they are flushed. Take the budget of the parent trace and distribute
3181 // it fairly amongst the children.
3182 if (new_trace.actions() != nullptr) {
3183 new_trace.set_flush_budget(new_flush_budget);
3184 }
3185 bool next_expects_preload =
3186 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3187 EmitOutOfLineContinuation(compiler, &new_trace, alternatives_->At(i),
3188 alt_gen, preload.preload_characters_,
3189 next_expects_preload);
3190 }
3191}
3192
3193Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
3194 Trace* trace,
3195 AlternativeGenerationList* alt_gens,
3196 PreloadState* preload,
3197 GreedyLoopState* greedy_loop_state,
3198 intptr_t text_length) {
3199 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3200 // Here we have special handling for greedy loops containing only text nodes
3201 // and other simple nodes. These are handled by pushing the current
3202 // position on the stack and then incrementing the current position each
3203 // time around the switch. On backtrack we decrement the current position
3204 // and check it against the pushed value. This avoids pushing backtrack
3205 // information for each iteration of the loop, which could take up a lot of
3206 // space.
3207 ASSERT(trace->stop_node() == nullptr);
3208 macro_assembler->PushCurrentPosition();
3209 BlockLabel greedy_match_failed;
3210 Trace greedy_match_trace;
3211 if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
3212 greedy_match_trace.set_backtrack(&greedy_match_failed);
3213 BlockLabel loop_label;
3214 macro_assembler->BindBlock(&loop_label);
3215 macro_assembler->CheckPreemption(/*is_backtrack=*/false);
3216 greedy_match_trace.set_stop_node(this);
3217 greedy_match_trace.set_loop_label(&loop_label);
3218 (*alternatives_)[0].node()->Emit(compiler, &greedy_match_trace);
3219 macro_assembler->BindBlock(&greedy_match_failed);
3220
3221 BlockLabel second_choice; // For use in greedy matches.
3222 macro_assembler->BindBlock(&second_choice);
3223
3224 Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
3225
3226 EmitChoices(compiler, alt_gens, 1, new_trace, preload);
3227
3228 macro_assembler->BindBlock(greedy_loop_state->label());
3229 // If we have unwound to the bottom then backtrack.
3230 macro_assembler->CheckGreedyLoop(trace->backtrack());
3231 // Otherwise try the second priority at an earlier position.
3232 macro_assembler->AdvanceCurrentPosition(-text_length);
3233 macro_assembler->GoTo(&second_choice);
3234 return new_trace;
3235}
3236
3237intptr_t ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
3238 Trace* trace) {
3239 intptr_t eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
3240 if (alternatives_->length() != 2) return eats_at_least;
3241
3242 GuardedAlternative alt1 = alternatives_->At(1);
3243 if (alt1.guards() != nullptr && alt1.guards()->length() != 0) {
3244 return eats_at_least;
3245 }
3246 RegExpNode* eats_anything_node = alt1.node();
3247 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
3248 return eats_at_least;
3249 }
3250
3251 // Really we should be creating a new trace when we execute this function,
3252 // but there is no need, because the code it generates cannot backtrack, and
3253 // we always arrive here with a trivial trace (since it's the entry to a
3254 // loop. That also implies that there are no preloaded characters, which is
3255 // good, because it means we won't be violating any assumptions by
3256 // overwriting those characters with new load instructions.
3257 ASSERT(trace->is_trivial());
3258
3259 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3260 // At this point we know that we are at a non-greedy loop that will eat
3261 // any character one at a time. Any non-anchored regexp has such a
3262 // loop prepended to it in order to find where it starts. We look for
3263 // a pattern of the form ...abc... where we can look 6 characters ahead
3264 // and step forwards 3 if the character is not one of abc. Abc need
3265 // not be atoms, they can be any reasonably limited character class or
3266 // small alternation.
3267 BoyerMooreLookahead* bm = bm_info(false);
3268 if (bm == nullptr) {
3269 eats_at_least = Utils::Minimum(
3272 if (eats_at_least >= 1) {
3273 bm = new (Z) BoyerMooreLookahead(eats_at_least, compiler, Z);
3274 GuardedAlternative alt0 = alternatives_->At(0);
3275 alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, false);
3276 }
3277 }
3278 if (bm != nullptr) {
3279 bm->EmitSkipInstructions(macro_assembler);
3280 }
3281 return eats_at_least;
3282}
3283
3284void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
3285 AlternativeGenerationList* alt_gens,
3286 intptr_t first_choice,
3287 Trace* trace,
3288 PreloadState* preload) {
3289 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3290 SetUpPreLoad(compiler, trace, preload);
3291
3292 // For now we just call all choices one after the other. The idea ultimately
3293 // is to use the Dispatch table to try only the relevant ones.
3294 intptr_t choice_count = alternatives_->length();
3295
3296 intptr_t new_flush_budget = trace->flush_budget() / choice_count;
3297
3298 for (intptr_t i = first_choice; i < choice_count; i++) {
3299 bool is_last = i == choice_count - 1;
3300 bool fall_through_on_failure = !is_last;
3301 GuardedAlternative alternative = alternatives_->At(i);
3302 AlternativeGeneration* alt_gen = alt_gens->at(i);
3303 alt_gen->quick_check_details.set_characters(preload->preload_characters_);
3304 ZoneGrowableArray<Guard*>* guards = alternative.guards();
3305 intptr_t guard_count = (guards == nullptr) ? 0 : guards->length();
3306 Trace new_trace(*trace);
3307 new_trace.set_characters_preloaded(
3308 preload->preload_is_current_ ? preload->preload_characters_ : 0);
3309 if (preload->preload_has_checked_bounds_) {
3310 new_trace.set_bound_checked_up_to(preload->preload_characters_);
3311 }
3312 new_trace.quick_check_performed()->Clear();
3313 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
3314 if (!is_last) {
3315 new_trace.set_backtrack(&alt_gen->after);
3316 }
3317 alt_gen->expects_preload = preload->preload_is_current_;
3318 bool generate_full_check_inline = false;
3319 if (kRegexpOptimization &&
3321 alternative.node()->EmitQuickCheck(
3322 compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
3323 &alt_gen->possible_success, &alt_gen->quick_check_details,
3324 fall_through_on_failure)) {
3325 // Quick check was generated for this choice.
3326 preload->preload_is_current_ = true;
3327 preload->preload_has_checked_bounds_ = true;
3328 // If we generated the quick check to fall through on possible success,
3329 // we now need to generate the full check inline.
3330 if (!fall_through_on_failure) {
3331 macro_assembler->BindBlock(&alt_gen->possible_success);
3332 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3333 new_trace.set_characters_preloaded(preload->preload_characters_);
3334 new_trace.set_bound_checked_up_to(preload->preload_characters_);
3335 generate_full_check_inline = true;
3336 }
3337 } else if (alt_gen->quick_check_details.cannot_match()) {
3338 if (!fall_through_on_failure) {
3339 macro_assembler->GoTo(trace->backtrack());
3340 }
3341 continue;
3342 } else {
3343 // No quick check was generated. Put the full code here.
3344 // If this is not the first choice then there could be slow checks from
3345 // previous cases that go here when they fail. There's no reason to
3346 // insist that they preload characters since the slow check we are about
3347 // to generate probably can't use it.
3348 if (i != first_choice) {
3349 alt_gen->expects_preload = false;
3350 new_trace.InvalidateCurrentCharacter();
3351 }
3352 generate_full_check_inline = true;
3353 }
3354 if (generate_full_check_inline) {
3355 if (new_trace.actions() != nullptr) {
3356 new_trace.set_flush_budget(new_flush_budget);
3357 }
3358 for (intptr_t j = 0; j < guard_count; j++) {
3359 GenerateGuard(macro_assembler, guards->At(j), &new_trace);
3360 }
3361 alternative.node()->Emit(compiler, &new_trace);
3362 preload->preload_is_current_ = false;
3363 }
3364 macro_assembler->BindBlock(&alt_gen->after);
3365 }
3366}
3367
3368void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
3369 Trace* trace,
3370 GuardedAlternative alternative,
3371 AlternativeGeneration* alt_gen,
3372 intptr_t preload_characters,
3373 bool next_expects_preload) {
3374 if (!alt_gen->possible_success.is_linked()) return;
3375
3376 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3377 macro_assembler->BindBlock(&alt_gen->possible_success);
3378 Trace out_of_line_trace(*trace);
3379 out_of_line_trace.set_characters_preloaded(preload_characters);
3380 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
3381 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
3382 ZoneGrowableArray<Guard*>* guards = alternative.guards();
3383 intptr_t guard_count = (guards == nullptr) ? 0 : guards->length();
3384 if (next_expects_preload) {
3385 BlockLabel reload_current_char;
3386 out_of_line_trace.set_backtrack(&reload_current_char);
3387 for (intptr_t j = 0; j < guard_count; j++) {
3388 GenerateGuard(macro_assembler, guards->At(j), &out_of_line_trace);
3389 }
3390 alternative.node()->Emit(compiler, &out_of_line_trace);
3391 macro_assembler->BindBlock(&reload_current_char);
3392 // Reload the current character, since the next quick check expects that.
3393 // We don't need to check bounds here because we only get into this
3394 // code through a quick check which already did the checked load.
3395 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false,
3396 preload_characters);
3397 macro_assembler->GoTo(&(alt_gen->after));
3398 } else {
3399 out_of_line_trace.set_backtrack(&(alt_gen->after));
3400 for (intptr_t j = 0; j < guard_count; j++) {
3401 GenerateGuard(macro_assembler, guards->At(j), &out_of_line_trace);
3402 }
3403 alternative.node()->Emit(compiler, &out_of_line_trace);
3404 }
3405}
3406
3408 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3409 LimitResult limit_result = LimitVersions(compiler, trace);
3410 if (limit_result == DONE) return;
3411 ASSERT(limit_result == CONTINUE);
3412
3414
3415 switch (action_type_) {
3416 case STORE_POSITION: {
3417 Trace::DeferredCapture new_capture(data_.u_position_register.reg,
3418 data_.u_position_register.is_capture,
3419 trace);
3420 Trace new_trace = *trace;
3421 new_trace.add_action(&new_capture);
3422 on_success()->Emit(compiler, &new_trace);
3423 break;
3424 }
3425 case INCREMENT_REGISTER: {
3427 data_.u_increment_register.reg);
3428 Trace new_trace = *trace;
3429 new_trace.add_action(&new_increment);
3430 on_success()->Emit(compiler, &new_trace);
3431 break;
3432 }
3433 case SET_REGISTER: {
3434 Trace::DeferredSetRegister new_set(data_.u_store_register.reg,
3435 data_.u_store_register.value);
3436 Trace new_trace = *trace;
3437 new_trace.add_action(&new_set);
3438 on_success()->Emit(compiler, &new_trace);
3439 break;
3440 }
3441 case CLEAR_CAPTURES: {
3443 data_.u_clear_captures.range_from, data_.u_clear_captures.range_to));
3444 Trace new_trace = *trace;
3445 new_trace.add_action(&new_capture);
3446 on_success()->Emit(compiler, &new_trace);
3447 break;
3448 }
3449 case BEGIN_SUBMATCH:
3450 if (!trace->is_trivial()) {
3451 trace->Flush(compiler, this);
3452 } else {
3454 data_.u_submatch.current_position_register, 0);
3455 assembler->WriteStackPointerToRegister(
3456 data_.u_submatch.stack_pointer_register);
3457 on_success()->Emit(compiler, trace);
3458 }
3459 break;
3460 case EMPTY_MATCH_CHECK: {
3461 intptr_t start_pos_reg = data_.u_empty_match_check.start_register;
3462 intptr_t stored_pos = 0;
3463 intptr_t rep_reg = data_.u_empty_match_check.repetition_register;
3464 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
3465 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
3466 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
3467 // If we know we haven't advanced and there is no minimum we
3468 // can just backtrack immediately.
3469 assembler->GoTo(trace->backtrack());
3470 } else if (know_dist && stored_pos < trace->cp_offset()) {
3471 // If we know we've advanced we can generate the continuation
3472 // immediately.
3473 on_success()->Emit(compiler, trace);
3474 } else if (!trace->is_trivial()) {
3475 trace->Flush(compiler, this);
3476 } else {
3477 BlockLabel skip_empty_check;
3478 // If we have a minimum number of repetitions we check the current
3479 // number first and skip the empty check if it's not enough.
3480 if (has_minimum) {
3481 intptr_t limit = data_.u_empty_match_check.repetition_limit;
3482 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
3483 }
3484 // If the match is empty we bail out, otherwise we fall through
3485 // to the on-success continuation.
3486 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
3487 trace->backtrack());
3488 assembler->BindBlock(&skip_empty_check);
3489 on_success()->Emit(compiler, trace);
3490 }
3491 break;
3492 }
3494 if (!trace->is_trivial()) {
3495 trace->Flush(compiler, this);
3496 return;
3497 }
3499 data_.u_submatch.current_position_register);
3501 data_.u_submatch.stack_pointer_register);
3502 intptr_t clear_register_count = data_.u_submatch.clear_register_count;
3503 if (clear_register_count == 0) {
3504 on_success()->Emit(compiler, trace);
3505 return;
3506 }
3507 intptr_t clear_registers_from = data_.u_submatch.clear_register_from;
3508 BlockLabel clear_registers_backtrack;
3509 Trace new_trace = *trace;
3510 new_trace.set_backtrack(&clear_registers_backtrack);
3511 on_success()->Emit(compiler, &new_trace);
3512
3513 assembler->BindBlock(&clear_registers_backtrack);
3514 intptr_t clear_registers_to =
3515 clear_registers_from + clear_register_count - 1;
3516 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
3517
3518 ASSERT(trace->backtrack() == nullptr);
3519 assembler->Backtrack();
3520 return;
3521 }
3522 default:
3523 UNREACHABLE();
3524 }
3525}
3526
3528 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3529 if (!trace->is_trivial()) {
3530 trace->Flush(compiler, this);
3531 return;
3532 }
3533
3534 LimitResult limit_result = LimitVersions(compiler, trace);
3535 if (limit_result == DONE) return;
3536 ASSERT(limit_result == CONTINUE);
3537
3539
3540 ASSERT(start_reg_ + 1 == end_reg_);
3541 if (flags_.IgnoreCase()) {
3543 start_reg_, read_backward(), flags_.IsUnicode(), trace->backtrack());
3544 } else {
3545 assembler->CheckNotBackReference(start_reg_, read_backward(),
3546 trace->backtrack());
3547 }
3548 // We are going to advance backward, so we may end up at the start.
3550
3551 // Check that the back reference does not end inside a surrogate pair.
3552 if (flags_.IsUnicode() && !compiler->one_byte()) {
3553 assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
3554 }
3555
3556 on_success()->Emit(compiler, trace);
3557}
3558
3559// -------------------------------------------------------------------
3560// Dot/dotty output
3561
3562#ifdef DEBUG
3563
3564class DotPrinter : public NodeVisitor {
3565 public:
3566 explicit DotPrinter(bool ignore_case) {}
3567 void PrintNode(const char* label, RegExpNode* node);
3568 void Visit(RegExpNode* node);
3569 void PrintAttributes(RegExpNode* from);
3570 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
3571#define DECLARE_VISIT(Type) virtual void Visit##Type(Type##Node* that);
3573#undef DECLARE_VISIT
3574};
3575
3576void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
3577 OS::PrintErr("digraph G {\n graph [label=\"");
3578 for (intptr_t i = 0; label[i] != '\0'; i++) {
3579 switch (label[i]) {
3580 case '\\':
3581 OS::PrintErr("\\\\");
3582 break;
3583 case '"':
3584 OS::PrintErr("\"");
3585 break;
3586 default:
3587 OS::PrintErr("%c", label[i]);
3588 break;
3589 }
3590 }
3591 OS::PrintErr("\"];\n");
3592 Visit(node);
3593 OS::PrintErr("}\n");
3594}
3595
3596void DotPrinter::Visit(RegExpNode* node) {
3597 if (node->info()->visited) return;
3598 node->info()->visited = true;
3599 node->Accept(this);
3600}
3601
3602void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
3603 OS::PrintErr(" n%p -> n%p [style=dotted];\n", from, on_failure);
3604 Visit(on_failure);
3605}
3606
3607class AttributePrinter : public ValueObject {
3608 public:
3609 AttributePrinter() : first_(true) {}
3610 void PrintSeparator() {
3611 if (first_) {
3612 first_ = false;
3613 } else {
3614 OS::PrintErr("|");
3615 }
3616 }
3617 void PrintBit(const char* name, bool value) {
3618 if (!value) return;
3619 PrintSeparator();
3620 OS::PrintErr("{%s}", name);
3621 }
3622 void PrintPositive(const char* name, intptr_t value) {
3623 if (value < 0) return;
3624 PrintSeparator();
3625 OS::PrintErr("{%s|%" Pd "}", name, value);
3626 }
3627
3628 private:
3629 bool first_;
3630};
3631
3632void DotPrinter::PrintAttributes(RegExpNode* that) {
3634 " a%p [shape=Mrecord, color=grey, fontcolor=grey, "
3635 "margin=0.1, fontsize=10, label=\"{",
3636 that);
3637 AttributePrinter printer;
3638 NodeInfo* info = that->info();
3639 printer.PrintBit("NI", info->follows_newline_interest);
3640 printer.PrintBit("WI", info->follows_word_interest);
3641 printer.PrintBit("SI", info->follows_start_interest);
3642 BlockLabel* label = that->label();
3643 if (label->is_bound()) printer.PrintPositive("@", label->pos());
3645 "}\"];\n"
3646 " a%p -> n%p [style=dashed, color=grey, arrowhead=none];\n",
3647 that, that);
3648}
3649
3650void DotPrinter::VisitChoice(ChoiceNode* that) {
3651 OS::PrintErr(" n%p [shape=Mrecord, label=\"?\"];\n", that);
3652 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
3653 GuardedAlternative alt = that->alternatives()->At(i);
3654 OS::PrintErr(" n%p -> n%p", that, alt.node());
3655 }
3656 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
3657 GuardedAlternative alt = that->alternatives()->At(i);
3658 alt.node()->Accept(this);
3659 }
3660}
3661
3662void DotPrinter::VisitText(TextNode* that) {
3663 OS::PrintErr(" n%p [label=\"", that);
3664 for (intptr_t i = 0; i < that->elements()->length(); i++) {
3665 if (i > 0) OS::PrintErr(" ");
3666 TextElement elm = that->elements()->At(i);
3667 switch (elm.text_type()) {
3668 case TextElement::ATOM: {
3669 ZoneGrowableArray<uint16_t>* data = elm.atom()->data();
3670 for (intptr_t i = 0; i < data->length(); i++) {
3671 OS::PrintErr("%c", static_cast<char>(data->At(i)));
3672 }
3673 break;
3674 }
3676 RegExpCharacterClass* node = elm.char_class();
3677 OS::PrintErr("[");
3678 if (node->is_negated()) OS::PrintErr("^");
3679 for (intptr_t j = 0; j < node->ranges()->length(); j++) {
3680 CharacterRange range = node->ranges()->At(j);
3681 PrintUtf16(range.from());
3682 OS::PrintErr("-");
3683 PrintUtf16(range.to());
3684 }
3685 OS::PrintErr("]");
3686 break;
3687 }
3688 default:
3689 UNREACHABLE();
3690 }
3691 }
3692 OS::PrintErr("\", shape=box, peripheries=2];\n");
3693 PrintAttributes(that);
3694 OS::PrintErr(" n%p -> n%p;\n", that, that->on_success());
3695 Visit(that->on_success());
3696}
3697
3698void DotPrinter::VisitBackReference(BackReferenceNode* that) {
3699 OS::PrintErr(" n%p [label=\"$%" Pd "..$%" Pd "\", shape=doubleoctagon];\n",
3700 that, that->start_register(), that->end_register());
3701 PrintAttributes(that);
3702 OS::PrintErr(" n%p -> n%p;\n", that, that->on_success());
3703 Visit(that->on_success());
3704}
3705
3706void DotPrinter::VisitEnd(EndNode* that) {
3707 OS::PrintErr(" n%p [style=bold, shape=point];\n", that);
3708 PrintAttributes(that);
3709}
3710
3711void DotPrinter::VisitAssertion(AssertionNode* that) {
3712 OS::PrintErr(" n%p [", that);
3713 switch (that->assertion_type()) {
3715 OS::PrintErr("label=\"$\", shape=septagon");
3716 break;
3718 OS::PrintErr("label=\"^\", shape=septagon");
3719 break;
3721 OS::PrintErr("label=\"\\b\", shape=septagon");
3722 break;
3724 OS::PrintErr("label=\"\\B\", shape=septagon");
3725 break;
3727 OS::PrintErr("label=\"(?<=\\n)\", shape=septagon");
3728 break;
3729 }
3730 OS::PrintErr("];\n");
3731 PrintAttributes(that);
3732 RegExpNode* successor = that->on_success();
3733 OS::PrintErr(" n%p -> n%p;\n", that, successor);
3734 Visit(successor);
3735}
3736
3737void DotPrinter::VisitAction(ActionNode* that) {
3738 OS::PrintErr(" n%p [", that);
3739 switch (that->action_type_) {
3741 OS::PrintErr("label=\"$%" Pd ":=%" Pd "\", shape=octagon",
3742 that->data_.u_store_register.reg,
3743 that->data_.u_store_register.value);
3744 break;
3746 OS::PrintErr("label=\"$%" Pd "++\", shape=octagon",
3747 that->data_.u_increment_register.reg);
3748 break;
3750 OS::PrintErr("label=\"$%" Pd ":=$pos\", shape=octagon",
3751 that->data_.u_position_register.reg);
3752 break;
3754 OS::PrintErr("label=\"$%" Pd ":=$pos,begin\", shape=septagon",
3755 that->data_.u_submatch.current_position_register);
3756 break;
3758 OS::PrintErr("label=\"escape\", shape=septagon");
3759 break;
3761 OS::PrintErr("label=\"$%" Pd "=$pos?,$%" Pd "<%" Pd "?\", shape=septagon",
3762 that->data_.u_empty_match_check.start_register,
3763 that->data_.u_empty_match_check.repetition_register,
3764 that->data_.u_empty_match_check.repetition_limit);
3765 break;
3767 OS::PrintErr("label=\"clear $%" Pd " to $%" Pd "\", shape=septagon",
3768 that->data_.u_clear_captures.range_from,
3769 that->data_.u_clear_captures.range_to);
3770 break;
3771 }
3772 }
3773 OS::PrintErr("];\n");
3774 PrintAttributes(that);
3775 RegExpNode* successor = that->on_success();
3776 OS::PrintErr(" n%p -> n%p;\n", that, successor);
3777 Visit(successor);
3778}
3779
3780void RegExpEngine::DotPrint(const char* label,
3781 RegExpNode* node,
3782 bool ignore_case) {
3783 DotPrinter printer(ignore_case);
3784 printer.PrintNode(label, node);
3785}
3786
3787#endif // DEBUG
3788
3789// -------------------------------------------------------------------
3790// Tree to graph conversion
3791
3792// The zone in which we allocate graph nodes.
3793#define OZ (on_success->zone())
3794
3796 RegExpNode* on_success) {
3799 elms->Add(TextElement::Atom(this));
3800 return new (OZ) TextNode(elms, compiler->read_backward(), on_success);
3801}
3802
3804 RegExpNode* on_success) {
3807 for (intptr_t i = 0; i < elements()->length(); i++) {
3808 elms->Add(elements()->At(i));
3809 }
3810 return new (OZ) TextNode(elms, compiler->read_backward(), on_success);
3811}
3812
3814 const int32_t* special_class,
3815 intptr_t length) {
3816 length--; // Remove final kRangeEndMarker.
3817 ASSERT(special_class[length] == kRangeEndMarker);
3818 ASSERT(ranges->length() != 0);
3819 ASSERT(length != 0);
3820 ASSERT(special_class[0] != 0);
3821 if (ranges->length() != (length >> 1) + 1) {
3822 return false;
3823 }
3824 CharacterRange range = ranges->At(0);
3825 if (range.from() != 0) {
3826 return false;
3827 }
3828 for (intptr_t i = 0; i < length; i += 2) {
3829 if (special_class[i] != (range.to() + 1)) {
3830 return false;
3831 }
3832 range = ranges->At((i >> 1) + 1);
3833 if (special_class[i + 1] != range.from()) {
3834 return false;
3835 }
3836 }
3837 if (range.to() != Utf::kMaxCodePoint) {
3838 return false;
3839 }
3840 return true;
3841}
3842
3844 const int32_t* special_class,
3845 intptr_t length) {
3846 length--; // Remove final kRangeEndMarker.
3847 ASSERT(special_class[length] == kRangeEndMarker);
3848 if (ranges->length() * 2 != length) {
3849 return false;
3850 }
3851 for (intptr_t i = 0; i < length; i += 2) {
3852 CharacterRange range = ranges->At(i >> 1);
3853 if (range.from() != special_class[i] ||
3854 range.to() != special_class[i + 1] - 1) {
3855 return false;
3856 }
3857 }
3858 return true;
3859}
3860
3862 // TODO(lrn): Remove need for this function, by not throwing away information
3863 // along the way.
3864 if (is_negated()) {
3865 return false;
3866 }
3867 if (set_.is_standard()) {
3868 return true;
3869 }
3871 set_.set_standard_set_type('s');
3872 return true;
3873 }
3875 set_.set_standard_set_type('S');
3876 return true;
3877 }
3880 set_.set_standard_set_type('.');
3881 return true;
3882 }
3885 set_.set_standard_set_type('n');
3886 return true;
3887 }
3889 set_.set_standard_set_type('w');
3890 return true;
3891 }
3893 set_.set_standard_set_type('W');
3894 return true;
3895 }
3896 return false;
3897}
3898
3900 Zone* zone,
3902 : zone_(zone),
3903 table_(zone),
3904 bmp_(nullptr),
3905 lead_surrogates_(nullptr),
3906 trail_surrogates_(nullptr),
3907 non_bmp_(nullptr) {
3908 // The unicode range splitter categorizes given character ranges into:
3909 // - Code points from the BMP representable by one code unit.
3910 // - Code points outside the BMP that need to be split into surrogate pairs.
3911 // - Lone lead surrogates.
3912 // - Lone trail surrogates.
3913 // Lone surrogates are valid code points, even though no actual characters.
3914 // They require special matching to make sure we do not split surrogate pairs.
3915 // We use the dispatch table to accomplish this. The base range is split up
3916 // by the table by the overlay ranges, and the Call callback is used to
3917 // filter and collect ranges for each category.
3918 for (intptr_t i = 0; i < base->length(); i++) {
3919 table_.AddRange(base->At(i), kBase, zone_);
3920 }
3921 // Add overlay ranges.
3923 kBmpCodePoints, zone_);
3926 kLeadSurrogates, zone_);
3929 kTrailSurrogates, zone_);
3930 table_.AddRange(
3932 kBmpCodePoints, zone_);
3933 table_.AddRange(
3935 kNonBmpCodePoints, zone_);
3936 table_.ForEach(this);
3937}
3938
3940 OutSet* outset = entry.out_set();
3941 if (!outset->Get(kBase)) return;
3943 if (outset->Get(kBmpCodePoints)) {
3944 target = &bmp_;
3945 } else if (outset->Get(kLeadSurrogates)) {
3946 target = &lead_surrogates_;
3947 } else if (outset->Get(kTrailSurrogates)) {
3948 target = &trail_surrogates_;
3949 } else {
3950 ASSERT(outset->Get(kNonBmpCodePoints));
3951 target = &non_bmp_;
3952 }
3953 if (*target == nullptr) {
3954 *target = new (zone_) ZoneGrowableArray<CharacterRange>(2);
3955 }
3956 (*target)->Add(CharacterRange::Range(entry.from(), entry.to()));
3957}
3958
3961 RegExpNode* on_success,
3962 UnicodeRangeSplitter* splitter) {
3963 ZoneGrowableArray<CharacterRange>* bmp = splitter->bmp();
3964 if (bmp == nullptr) return;
3966 bmp, compiler->read_backward(), on_success, RegExpFlags())));
3967}
3968
3971 RegExpNode* on_success,
3972 UnicodeRangeSplitter* splitter) {
3973 ZoneGrowableArray<CharacterRange>* non_bmp = splitter->non_bmp();
3974 if (non_bmp == nullptr) return;
3975 ASSERT(!compiler->one_byte());
3977 for (int i = 0; i < non_bmp->length(); i++) {
3978 // Match surrogate pair.
3979 // E.g. [\u10005-\u11005] becomes
3980 // \ud800[\udc05-\udfff]|
3981 // [\ud801-\ud803][\udc00-\udfff]|
3982 // \ud804[\udc00-\udc05]
3983 uint32_t from = non_bmp->At(i).from();
3984 uint32_t to = non_bmp->At(i).to();
3985 uint16_t from_points[2];
3986 Utf16::Encode(from, from_points);
3987 uint16_t to_points[2];
3988 Utf16::Encode(to, to_points);
3989 if (from_points[0] == to_points[0]) {
3990 // The lead surrogate is the same.
3991 result->AddAlternative(
3993 CharacterRange::Singleton(from_points[0]),
3994 CharacterRange::Range(from_points[1], to_points[1]),
3995 compiler->read_backward(), on_success, RegExpFlags())));
3996 } else {
3997 if (from_points[1] != Utf16::kTrailSurrogateStart) {
3998 // Add [from_l][from_t-\udfff]
3999 result->AddAlternative(
4001 CharacterRange::Singleton(from_points[0]),
4002 CharacterRange::Range(from_points[1],
4004 compiler->read_backward(), on_success, RegExpFlags())));
4005 from_points[0]++;
4006 }
4007 if (to_points[1] != Utf16::kTrailSurrogateEnd) {
4008 // Add [to_l][\udc00-to_t]
4009 result->AddAlternative(
4011 CharacterRange::Singleton(to_points[0]),
4013 to_points[1]),
4014 compiler->read_backward(), on_success, RegExpFlags())));
4015 to_points[0]--;
4016 }
4017 if (from_points[0] <= to_points[0]) {
4018 // Add [from_l-to_l][\udc00-\udfff]
4019 result->AddAlternative(
4021 CharacterRange::Range(from_points[0], to_points[0]),
4024 compiler->read_backward(), on_success, RegExpFlags())));
4025 }
4026 }
4027 }
4028}
4029
4034 RegExpNode* on_success,
4035 bool read_backward,
4038 match, read_backward, on_success, flags);
4039 int stack_register = compiler->UnicodeLookaroundStackRegister();
4040 int position_register = compiler->UnicodeLookaroundPositionRegister();
4041 RegExpLookaround::Builder lookaround(false, match_node, stack_register,
4042 position_register);
4044 lookbehind, !read_backward, lookaround.on_match_success(), flags);
4045 return lookaround.ForMatch(negative_match);
4046}
4047
4052 RegExpNode* on_success,
4053 bool read_backward,
4055 int stack_register = compiler->UnicodeLookaroundStackRegister();
4056 int position_register = compiler->UnicodeLookaroundPositionRegister();
4057 RegExpLookaround::Builder lookaround(false, on_success, stack_register,
4058 position_register);
4060 lookahead, read_backward, lookaround.on_match_success(), flags);
4062 match, read_backward, lookaround.ForMatch(negative_match), flags);
4063}
4064
4067 RegExpNode* on_success,
4068 UnicodeRangeSplitter* splitter) {
4069 auto lead_surrogates = splitter->lead_surrogates();
4070 if (lead_surrogates == nullptr) return;
4071 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
4072 auto trail_surrogates = CharacterRange::List(
4075
4077 if (compiler->read_backward()) {
4078 // Reading backward. Assert that reading forward, there is no trail
4079 // surrogate, and then backward match the lead surrogate.
4081 compiler, trail_surrogates, lead_surrogates, on_success, true,
4082 RegExpFlags());
4083 } else {
4084 // Reading forward. Forward match the lead surrogate and assert that
4085 // no trail surrogate follows.
4087 compiler, lead_surrogates, trail_surrogates, on_success, false,
4088 RegExpFlags());
4089 }
4090 result->AddAlternative(GuardedAlternative(match));
4091}
4092
4095 RegExpNode* on_success,
4096 UnicodeRangeSplitter* splitter) {
4097 auto trail_surrogates = splitter->trail_surrogates();
4098 if (trail_surrogates == nullptr) return;
4099 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
4100 auto lead_surrogates = CharacterRange::List(
4103
4105 if (compiler->read_backward()) {
4106 // Reading backward. Backward match the trail surrogate and assert that no
4107 // lead surrogate precedes it.
4109 compiler, trail_surrogates, lead_surrogates, on_success, true,
4110 RegExpFlags());
4111 } else {
4112 // Reading forward. Assert that reading backward, there is no lead
4113 // surrogate, and then forward match the trail surrogate.
4115 compiler, lead_surrogates, trail_surrogates, on_success, false,
4116 RegExpFlags());
4117 }
4118 result->AddAlternative(GuardedAlternative(match));
4119}
4120
4122 RegExpNode* on_success) {
4123 // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
4124 ASSERT(!compiler->read_backward());
4125 // Advance any character. If the character happens to be a lead surrogate and
4126 // we advanced into the middle of a surrogate pair, it will work out, as
4127 // nothing will match from there. We will have to advance again, consuming
4128 // the associated trail surrogate.
4129 auto range = CharacterRange::List(
4131 return TextNode::CreateForCharacterRanges(range, false, on_success,
4132 RegExpFlags());
4133}
4134
4137
4138 // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
4139 // See also https://crbug.com/v8/6727.
4140 // TODO(sstrickl): This only covers the special case of the {0,0x10FFFF}
4141 // range, which we use frequently internally. But large ranges can also easily
4142 // be created by the user. We might want to have a more general caching
4143 // mechanism for such ranges.
4144 if (ranges->length() == 1 && ranges->At(0).IsEverything(Utf::kMaxCodePoint)) {
4145 return;
4146 }
4147
4148 icu::UnicodeSet set;
4149 for (int i = 0; i < ranges->length(); i++) {
4150 set.add(ranges->At(i).from(), ranges->At(i).to());
4151 }
4152 ranges->Clear();
4153 set.closeOver(USET_CASE_INSENSITIVE);
4154 // Full case mapping map single characters to multiple characters.
4155 // Those are represented as strings in the set. Remove them so that
4156 // we end up with only simple and common case mappings.
4157 set.removeAllStrings();
4158 for (int i = 0; i < set.getRangeCount(); i++) {
4159 ranges->Add(
4160 CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)));
4161 }
4162 // No errors and everything we collected have been ranges.
4164}
4165
4167 RegExpNode* on_success) {
4168 set_.Canonicalize();
4170 if (flags_.NeedsUnicodeCaseEquivalents()) {
4172 }
4173 if (flags_.IsUnicode() && !compiler->one_byte() &&
4175 if (is_negated()) {
4179 ranges = negated;
4180 }
4181 if (ranges->length() == 0) {
4184 return new TextNode(fail, compiler->read_backward(), on_success);
4185 }
4186 if (standard_type() == '*') {
4187 return UnanchoredAdvance(compiler, on_success);
4188 } else {
4189 ChoiceNode* result = new (OZ) ChoiceNode(2, OZ);
4190 UnicodeRangeSplitter splitter(OZ, ranges);
4191 AddBmpCharacters(compiler, result, on_success, &splitter);
4192 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
4193 AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
4194 AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
4195 return result;
4196 }
4197 } else {
4198 return new TextNode(this, compiler->read_backward(), on_success);
4199 }
4200 return new (OZ) TextNode(this, compiler->read_backward(), on_success);
4201}
4202
4204 RegExpNode* on_success) {
4206 intptr_t length = alternatives->length();
4208 for (intptr_t i = 0; i < length; i++) {
4209 GuardedAlternative alternative(
4210 alternatives->At(i)->ToNode(compiler, on_success));
4211 result->AddAlternative(alternative);
4212 }
4213 return result;
4214}
4215
4217 RegExpNode* on_success) {
4218 return ToNode(min(), max(), is_greedy(), body(), compiler, on_success);
4219}
4220
4221// Scoped object to keep track of how much we unroll quantifier loops in the
4222// regexp graph generator.
4224 public:
4225 static constexpr intptr_t kMaxExpansionFactor = 6;
4227 : compiler_(compiler),
4228 saved_expansion_factor_(compiler->current_expansion_factor()),
4229 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
4230 ASSERT(factor > 0);
4231 if (ok_to_expand_) {
4232 if (factor > kMaxExpansionFactor) {
4233 // Avoid integer overflow of the current expansion factor.
4234 ok_to_expand_ = false;
4235 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
4236 } else {
4237 intptr_t new_factor = saved_expansion_factor_ * factor;
4238 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
4239 compiler->set_current_expansion_factor(new_factor);
4240 }
4241 }
4242 }
4243
4245 compiler_->set_current_expansion_factor(saved_expansion_factor_);
4246 }
4247
4248 bool ok_to_expand() { return ok_to_expand_; }
4249
4250 private:
4251 RegExpCompiler* compiler_;
4252 intptr_t saved_expansion_factor_;
4253 bool ok_to_expand_;
4254
4255 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
4256};
4257
4259 intptr_t max,
4260 bool is_greedy,
4261 RegExpTree* body,
4263 RegExpNode* on_success,
4264 bool not_at_start) {
4265 // x{f, t} becomes this:
4266 //
4267 // (r++)<-.
4268 // | `
4269 // | (x)
4270 // v ^
4271 // (r=0)-->(?)---/ [if r < t]
4272 // |
4273 // [if r >= f] \----> ...
4274 //
4275
4276 // 15.10.2.5 RepeatMatcher algorithm.
4277 // The parser has already eliminated the case where max is 0. In the case
4278 // where max_match is zero the parser has removed the quantifier if min was
4279 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
4280
4281 // If we know that we cannot match zero length then things are a little
4282 // simpler since we don't need to make the special zero length match check
4283 // from step 2.1. If the min and max are small we can unroll a little in
4284 // this case.
4285 // Unroll (foo)+ and (foo){3,}
4286 const intptr_t kMaxUnrolledMinMatches = 3;
4287 // Unroll (foo)? and (foo){x,3}
4288 const intptr_t kMaxUnrolledMaxMatches = 3;
4289 if (max == 0) return on_success; // This can happen due to recursion.
4290 bool body_can_be_empty = (body->min_match() == 0);
4291 intptr_t body_start_reg = RegExpCompiler::kNoRegister;
4292 Interval capture_registers = body->CaptureRegisters();
4293 bool needs_capture_clearing = !capture_registers.is_empty();
4294 Zone* zone = compiler->zone();
4295
4296 if (body_can_be_empty) {
4297 body_start_reg = compiler->AllocateRegister();
4298 } else if (kRegexpOptimization && !needs_capture_clearing) {
4299 // Only unroll if there are no captures and the body can't be
4300 // empty.
4301 {
4302 RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0));
4303 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
4304 intptr_t new_max = (max == kInfinity) ? max : max - min;
4305 // Recurse once to get the loop or optional matches after the fixed
4306 // ones.
4307 RegExpNode* answer =
4308 ToNode(0, new_max, is_greedy, body, compiler, on_success, true);
4309 // Unroll the forced matches from 0 to min. This can cause chains of
4310 // TextNodes (which the parser does not generate). These should be
4311 // combined if it turns out they hinder good code generation.
4312 for (intptr_t i = 0; i < min; i++) {
4313 answer = body->ToNode(compiler, answer);
4314 }
4315 return answer;
4316 }
4317 }
4318 if (max <= kMaxUnrolledMaxMatches && min == 0) {
4319 ASSERT(max > 0); // Due to the 'if' above.
4321 if (limiter.ok_to_expand()) {
4322 // Unroll the optional matches up to max.
4323 RegExpNode* answer = on_success;
4324 for (intptr_t i = 0; i < max; i++) {
4325 ChoiceNode* alternation = new (zone) ChoiceNode(2, zone);
4326 if (is_greedy) {
4327 alternation->AddAlternative(
4329 alternation->AddAlternative(GuardedAlternative(on_success));
4330 } else {
4331 alternation->AddAlternative(GuardedAlternative(on_success));
4332 alternation->AddAlternative(
4334 }
4335 answer = alternation;
4336 if (not_at_start && !compiler->read_backward()) {
4337 alternation->set_not_at_start();
4338 }
4339 }
4340 return answer;
4341 }
4342 }
4343 }
4344 bool has_min = min > 0;
4345 bool has_max = max < RegExpTree::kInfinity;
4346 bool needs_counter = has_min || has_max;
4347 intptr_t reg_ctr = needs_counter ? compiler->AllocateRegister()
4349 LoopChoiceNode* center = new (zone)
4350 LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
4351 if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
4352 RegExpNode* loop_return =
4353 needs_counter ? static_cast<RegExpNode*>(
4354 ActionNode::IncrementRegister(reg_ctr, center))
4355 : static_cast<RegExpNode*>(center);
4356 if (body_can_be_empty) {
4357 // If the body can be empty we need to check if it was and then
4358 // backtrack.
4359 loop_return =
4360 ActionNode::EmptyMatchCheck(body_start_reg, reg_ctr, min, loop_return);
4361 }
4362 RegExpNode* body_node = body->ToNode(compiler, loop_return);
4363 if (body_can_be_empty) {
4364 // If the body can be empty we need to store the start position
4365 // so we can bail out if it was empty.
4366 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
4367 }
4368 if (needs_capture_clearing) {
4369 // Before entering the body of this loop we need to clear captures.
4370 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
4371 }
4372 GuardedAlternative body_alt(body_node);
4373 if (has_max) {
4374 Guard* body_guard = new (zone) Guard(reg_ctr, Guard::LT, max);
4375 body_alt.AddGuard(body_guard, zone);
4376 }
4377 GuardedAlternative rest_alt(on_success);
4378 if (has_min) {
4379 Guard* rest_guard = new (zone) Guard(reg_ctr, Guard::GEQ, min);
4380 rest_alt.AddGuard(rest_guard, zone);
4381 }
4382 if (is_greedy) {
4383 center->AddLoopAlternative(body_alt);
4384 center->AddContinueAlternative(rest_alt);
4385 } else {
4386 center->AddContinueAlternative(rest_alt);
4387 center->AddLoopAlternative(body_alt);
4388 }
4389 if (needs_counter) {
4390 return ActionNode::SetRegister(reg_ctr, 0, center);
4391 } else {
4392 return center;
4393 }
4394}
4395
4396namespace {
4397// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
4398// \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
4399RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
4400 RegExpNode* on_success,
4403 ASSERT(flags.NeedsUnicodeCaseEquivalents());
4406 CharacterRange::AddClassEscape('w', word_range, true);
4407 int stack_register = compiler->UnicodeLookaroundStackRegister();
4408 int position_register = compiler->UnicodeLookaroundPositionRegister();
4409 ChoiceNode* result = new (OZ) ChoiceNode(2, OZ);
4410 // Add two choices. The (non-)boundary could start with a word or
4411 // a non-word-character.
4412 for (int i = 0; i < 2; i++) {
4413 bool lookbehind_for_word = i == 0;
4414 bool lookahead_for_word =
4415 (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
4416 // Look to the left.
4417 RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
4418 stack_register, position_register);
4420 word_range, true, lookbehind.on_match_success(), flags);
4421 // Look to the right.
4422 RegExpLookaround::Builder lookahead(lookahead_for_word,
4423 lookbehind.ForMatch(backward),
4424 stack_register, position_register);
4426 word_range, false, lookahead.on_match_success(), flags);
4427 result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
4428 }
4429 return result;
4430}
4431} // anonymous namespace
4432
4434 RegExpNode* on_success) {
4435 switch (assertion_type()) {
4436 case START_OF_LINE:
4437 return AssertionNode::AfterNewline(on_success);
4438 case START_OF_INPUT:
4439 return AssertionNode::AtStart(on_success);
4440 case BOUNDARY:
4441 return flags_.NeedsUnicodeCaseEquivalents()
4442 ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY,
4443 flags_)
4444 : AssertionNode::AtBoundary(on_success);
4445 case NON_BOUNDARY:
4446 return flags_.NeedsUnicodeCaseEquivalents()
4447 ? BoundaryAssertionAsLookaround(compiler, on_success,
4448 NON_BOUNDARY, flags_)
4449 : AssertionNode::AtNonBoundary(on_success);
4450 case END_OF_INPUT:
4451 return AssertionNode::AtEnd(on_success);
4452 case END_OF_LINE: {
4453 // Compile $ in multiline regexps as an alternation with a positive
4454 // lookahead in one side and an end-of-input on the other side.
4455 // We need two registers for the lookahead.
4456 intptr_t stack_pointer_register = compiler->AllocateRegister();
4457 intptr_t position_register = compiler->AllocateRegister();
4458 // The ChoiceNode to distinguish between a newline and end-of-input.
4459 ChoiceNode* result = new ChoiceNode(2, on_success->zone());
4460 // Create a newline atom.
4461 ZoneGrowableArray<CharacterRange>* newline_ranges =
4463 CharacterRange::AddClassEscape('n', newline_ranges);
4464 RegExpCharacterClass* newline_atom =
4466 TextNode* newline_matcher =
4467 new TextNode(newline_atom, /*read_backwards=*/false,
4469 stack_pointer_register, position_register,
4470 0, // No captures inside.
4471 -1, // Ignored if no captures.
4472 on_success));
4473 // Create an end-of-input matcher.
4475 stack_pointer_register, position_register, newline_matcher);
4476 // Add the two alternatives to the ChoiceNode.
4477 GuardedAlternative eol_alternative(end_of_line);
4478 result->AddAlternative(eol_alternative);
4479 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
4480 result->AddAlternative(end_alternative);
4481 return result;
4482 }
4483 default:
4484 UNREACHABLE();
4485 }
4486 return on_success;
4487}
4488
4490 RegExpNode* on_success) {
4493 compiler->read_backward(), on_success);
4494}
4495
4497 RegExpNode* on_success) {
4498 return on_success;
4499}
4500
4502 RegExpNode* on_success,
4503 intptr_t stack_pointer_register,
4504 intptr_t position_register,
4505 intptr_t capture_register_count,
4506 intptr_t capture_register_start)
4507 : is_positive_(is_positive),
4508 on_success_(on_success),
4509 stack_pointer_register_(stack_pointer_register),
4510 position_register_(position_register) {
4511 if (is_positive_) {
4512 on_match_success_ = ActionNode::PositiveSubmatchSuccess(
4513 stack_pointer_register, position_register, capture_register_count,
4514 capture_register_start, on_success);
4515 } else {
4516 on_match_success_ = new (OZ) NegativeSubmatchSuccess(
4517 stack_pointer_register, position_register, capture_register_count,
4518 capture_register_start, OZ);
4519 }
4520}
4521
4523 if (is_positive_) {
4524 return ActionNode::BeginSubmatch(stack_pointer_register_,
4525 position_register_, match);
4526 } else {
4527 Zone* zone = on_success_->zone();
4528 // We use a ChoiceNode to represent the negative lookaround. The first
4529 // alternative is the negative match. On success, the end node backtracks.
4530 // On failure, the second alternative is tried and leads to success.
4531 // NegativeLookaroundChoiceNode is a special ChoiceNode that ignores the
4532 // first exit when calculating quick checks.
4533 ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
4534 GuardedAlternative(match), GuardedAlternative(on_success_), zone);
4535 return ActionNode::BeginSubmatch(stack_pointer_register_,
4536 position_register_, choice_node);
4537 }
4538}
4539
4541 RegExpNode* on_success) {
4542 intptr_t stack_pointer_register = compiler->AllocateRegister();
4543 intptr_t position_register = compiler->AllocateRegister();
4544
4545 const intptr_t registers_per_capture = 2;
4546 const intptr_t register_of_first_capture = 2;
4547 intptr_t register_count = capture_count_ * registers_per_capture;
4548 intptr_t register_start =
4549 register_of_first_capture + capture_from_ * registers_per_capture;
4550
4552 bool was_reading_backward = compiler->read_backward();
4553 compiler->set_read_backward(type() == LOOKBEHIND);
4554 Builder builder(is_positive(), on_success, stack_pointer_register,
4555 position_register, register_count, register_start);
4556 RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
4557 result = builder.ForMatch(match);
4558 compiler->set_read_backward(was_reading_backward);
4559 return result;
4560}
4561
4563 RegExpNode* on_success) {
4564 return ToNode(body(), index(), compiler, on_success);
4565}
4566
4568 intptr_t index,
4570 RegExpNode* on_success) {
4571 ASSERT(body != nullptr);
4572 intptr_t start_reg = RegExpCapture::StartRegister(index);
4573 intptr_t end_reg = RegExpCapture::EndRegister(index);
4574 if (compiler->read_backward()) {
4575 intptr_t tmp = end_reg;
4576 end_reg = start_reg;
4577 start_reg = tmp;
4578 }
4579 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
4580 RegExpNode* body_node = body->ToNode(compiler, store_end);
4581 return ActionNode::StorePosition(start_reg, true, body_node);
4582}
4583
4585 RegExpNode* on_success) {
4586 ZoneGrowableArray<RegExpTree*>* children = nodes();
4587 RegExpNode* current = on_success;
4588 if (compiler->read_backward()) {
4589 for (intptr_t i = 0; i < children->length(); i++) {
4590 current = children->At(i)->ToNode(compiler, current);
4591 }
4592 } else {
4593 for (intptr_t i = children->length() - 1; i >= 0; i--) {
4594 current = children->At(i)->ToNode(compiler, current);
4595 }
4596 }
4597 return current;
4598}
4599
4600static void AddClass(const int32_t* elmv,
4601 intptr_t elmc,
4603 elmc--;
4604 ASSERT(elmv[elmc] == kRangeEndMarker);
4605 for (intptr_t i = 0; i < elmc; i += 2) {
4606 ASSERT(elmv[i] < elmv[i + 1]);
4607 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1));
4608 }
4609}
4610
4611static void AddClassNegated(const int32_t* elmv,
4612 intptr_t elmc,
4614 elmc--;
4615 ASSERT(elmv[elmc] == kRangeEndMarker);
4616 ASSERT(elmv[0] != 0x0000);
4617 ASSERT(elmv[elmc - 1] != Utf::kMaxCodePoint);
4618 uint16_t last = 0x0000;
4619 for (intptr_t i = 0; i < elmc; i += 2) {
4620 ASSERT(last <= elmv[i] - 1);
4621 ASSERT(elmv[i] < elmv[i + 1]);
4622 ranges->Add(CharacterRange(last, elmv[i] - 1));
4623 last = elmv[i + 1];
4624 }
4625 ranges->Add(CharacterRange(last, Utf::kMaxCodePoint));
4626}
4627
4630 bool add_unicode_case_equivalents) {
4631 if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
4632 // See #sec-runtime-semantics-wordcharacters-abstract-operation
4633 // In case of unicode and ignore_case, we need to create the closure over
4634 // case equivalent characters before negating.
4637 AddClass(kWordRanges, kWordRangeCount, new_ranges);
4638 AddUnicodeCaseEquivalents(new_ranges);
4639 if (type == 'W') {
4642 CharacterRange::Negate(new_ranges, negated);
4643 new_ranges = negated;
4644 }
4645 ranges->AddArray(*new_ranges);
4646 return;
4647 }
4648 AddClassEscape(type, ranges);
4649}
4650
4653 switch (type) {
4654 case 's':
4656 break;
4657 case 'S':
4659 break;
4660 case 'w':
4662 break;
4663 case 'W':
4665 break;
4666 case 'd':
4668 break;
4669 case 'D':
4671 break;
4672 case '.':
4674 break;
4675 // This is not a character range as defined by the spec but a
4676 // convenient shorthand for a character class that matches any
4677 // character.
4678 case '*':
4680 break;
4681 // This is the set of characters matched by the $ and ^ symbols
4682 // in multiline mode.
4683 case 'n':
4685 break;
4686 default:
4687 UNREACHABLE();
4688 }
4689}
4690
4693 bool is_one_byte,
4694 Zone* zone) {
4696 int range_count = ranges->length();
4697 for (intptr_t i = 0; i < range_count; i++) {
4698 CharacterRange range = ranges->At(i);
4699 int32_t bottom = range.from();
4700 if (bottom > Utf16::kMaxCodeUnit) continue;
4701 int32_t top = Utils::Minimum(range.to(), Utf16::kMaxCodeUnit);
4702 // Nothing to be done for surrogates
4703 if (bottom >= Utf16::kLeadSurrogateStart &&
4705 continue;
4706 }
4707 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
4708 if (bottom > Symbols::kMaxOneCharCodeSymbol) continue;
4711 }
4712 }
4713
4717 if (top == bottom) {
4718 // If this is a singleton we just expand the one character.
4719 intptr_t length = jsregexp_uncanonicalize.get(bottom, '\0', chars);
4720 for (intptr_t i = 0; i < length; i++) {
4721 int32_t chr = chars[i];
4722 if (chr != bottom) {
4723 ranges->Add(CharacterRange::Singleton(chars[i]));
4724 }
4725 }
4726 } else {
4727 // If this is a range we expand the characters block by block,
4728 // expanding contiguous subranges (blocks) one at a time.
4729 // The approach is as follows. For a given start character we
4730 // look up the remainder of the block that contains it (represented
4731 // by the end point), for instance we find 'z' if the character
4732 // is 'c'. A block is characterized by the property
4733 // that all characters uncanonicalize in the same way, except that
4734 // each entry in the result is incremented by the distance from the first
4735 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A']
4736 // and the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
4737 // Once we've found the end point we look up its uncanonicalization
4738 // and produce a range for each element. For instance for [c-f]
4739 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
4740 // add a range if it is not already contained in the input, so [c-f]
4741 // will be skipped but [C-F] will be added. If this range is not
4742 // completely contained in a block we do this for all the blocks
4743 // covered by the range (handling characters that is not in a block
4744 // as a "singleton block").
4746 intptr_t pos = bottom;
4747 while (pos <= top) {
4748 intptr_t length = jsregexp_canonrange.get(pos, '\0', range);
4749 int32_t block_end;
4750 if (length == 0) {
4751 block_end = pos;
4752 } else {
4753 ASSERT(length == 1);
4754 block_end = range[0];
4755 }
4756 intptr_t end = (block_end > top) ? top : block_end;
4757 length = jsregexp_uncanonicalize.get(block_end, '\0', range);
4758 for (intptr_t i = 0; i < length; i++) {
4759 int32_t c = range[i];
4760 int32_t range_from = c - (block_end - pos);
4761 int32_t range_to = c - (block_end - end);
4762 if (!(bottom <= range_from && range_to <= top)) {
4763 ranges->Add(CharacterRange(range_from, range_to));
4764 }
4765 }
4766 pos = end + 1;
4767 }
4768 }
4769 }
4770}
4771
4773 ASSERT(ranges != nullptr);
4774 intptr_t n = ranges->length();
4775 if (n <= 1) return true;
4776 intptr_t max = ranges->At(0).to();
4777 for (intptr_t i = 1; i < n; i++) {
4778 CharacterRange next_range = ranges->At(i);
4779 if (next_range.from() <= max + 1) return false;
4780 max = next_range.to();
4781 }
4782 return true;
4783}
4784
4786 if (ranges_ == nullptr) {
4787 ranges_ = new ZoneGrowableArray<CharacterRange>(2);
4788 CharacterRange::AddClassEscape(standard_set_type_, ranges_);
4789 }
4790 return ranges_;
4791}
4792
4793// Move a number of elements in a zone array to another position
4794// in the same array. Handles overlapping source and target areas.
4796 intptr_t from,
4797 intptr_t to,
4798 intptr_t count) {
4799 // Ranges are potentially overlapping.
4800 if (from < to) {
4801 for (intptr_t i = count - 1; i >= 0; i--) {
4802 (*list)[to + i] = list->At(from + i);
4803 }
4804 } else {
4805 for (intptr_t i = 0; i < count; i++) {
4806 (*list)[to + i] = list->At(from + i);
4807 }
4808 }
4809}
4810
4813 intptr_t count,
4814 CharacterRange insert) {
4815 // Inserts a range into list[0..count[, which must be sorted
4816 // by from value and non-overlapping and non-adjacent, using at most
4817 // list[0..count] for the result. Returns the number of resulting
4818 // canonicalized ranges. Inserting a range may collapse existing ranges into
4819 // fewer ranges, so the return value can be anything in the range 1..count+1.
4820 int32_t from = insert.from();
4821 int32_t to = insert.to();
4822 intptr_t start_pos = 0;
4823 intptr_t end_pos = count;
4824 for (intptr_t i = count - 1; i >= 0; i--) {
4825 CharacterRange current = list->At(i);
4826 if (current.from() > to + 1) {
4827 end_pos = i;
4828 } else if (current.to() + 1 < from) {
4829 start_pos = i + 1;
4830 break;
4831 }
4832 }
4833
4834 // Inserted range overlaps, or is adjacent to, ranges at positions
4835 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
4836 // not affected by the insertion.
4837 // If start_pos == end_pos, the range must be inserted before start_pos.
4838 // if start_pos < end_pos, the entire range from start_pos to end_pos
4839 // must be merged with the insert range.
4840
4841 if (start_pos == end_pos) {
4842 // Insert between existing ranges at position start_pos.
4843 if (start_pos < count) {
4844 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
4845 }
4846 (*list)[start_pos] = insert;
4847 return count + 1;
4848 }
4849 if (start_pos + 1 == end_pos) {
4850 // Replace single existing range at position start_pos.
4851 CharacterRange to_replace = list->At(start_pos);
4852 intptr_t new_from = Utils::Minimum(to_replace.from(), from);
4853 intptr_t new_to = Utils::Maximum(to_replace.to(), to);
4854 (*list)[start_pos] = CharacterRange(new_from, new_to);
4855 return count;
4856 }
4857 // Replace a number of existing ranges from start_pos to end_pos - 1.
4858 // Move the remaining ranges down.
4859
4860 intptr_t new_from = Utils::Minimum(list->At(start_pos).from(), from);
4861 intptr_t new_to = Utils::Maximum(list->At(end_pos - 1).to(), to);
4862 if (end_pos < count) {
4863 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
4864 }
4865 (*list)[start_pos] = CharacterRange(new_from, new_to);
4866 return count - (end_pos - start_pos) + 1;
4867}
4868
4870 // Special/default classes are always considered canonical. The result
4871 // of calling ranges() will be sorted.
4872 if (ranges_ == nullptr) return;
4874}
4875
4877 ZoneGrowableArray<CharacterRange>* character_ranges) {
4878 if (character_ranges->length() <= 1) return;
4879 // Check whether ranges are already canonical (increasing, non-overlapping,
4880 // non-adjacent).
4881 intptr_t n = character_ranges->length();
4882 intptr_t max = character_ranges->At(0).to();
4883 intptr_t i = 1;
4884 while (i < n) {
4885 CharacterRange current = character_ranges->At(i);
4886 if (current.from() <= max + 1) {
4887 break;
4888 }
4889 max = current.to();
4890 i++;
4891 }
4892 // Canonical until the i'th range. If that's all of them, we are done.
4893 if (i == n) return;
4894
4895 // The ranges at index i and forward are not canonicalized. Make them so by
4896 // doing the equivalent of insertion sort (inserting each into the previous
4897 // list, in order).
4898 // Notice that inserting a range can reduce the number of ranges in the
4899 // result due to combining of adjacent and overlapping ranges.
4900 intptr_t read = i; // Range to insert.
4901 intptr_t num_canonical = i; // Length of canonicalized part of list.
4902 do {
4903 num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical,
4904 character_ranges->At(read));
4905 read++;
4906 } while (read < n);
4907 character_ranges->TruncateTo(num_canonical);
4908
4909 ASSERT(CharacterRange::IsCanonical(character_ranges));
4910}
4911
4913 ZoneGrowableArray<CharacterRange>* negated_ranges) {
4915 ASSERT(negated_ranges->length() == 0);
4916 intptr_t range_count = ranges->length();
4917 uint32_t from = 0;
4918 intptr_t i = 0;
4919 if (range_count > 0 && ranges->At(0).from() == 0) {
4920 from = ranges->At(0).to();
4921 i = 1;
4922 }
4923 while (i < range_count) {
4924 CharacterRange range = ranges->At(i);
4925 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1));
4926 from = range.to();
4927 i++;
4928 }
4929 if (from < Utf::kMaxCodePoint) {
4930 negated_ranges->Add(CharacterRange(from + 1, Utf::kMaxCodePoint));
4931 }
4932}
4933
4934// -------------------------------------------------------------------
4935// Splay tree
4936
4937// Workaround for the fact that ZoneGrowableArray does not have contains().
4938static bool ArrayContains(ZoneGrowableArray<unsigned>* array, unsigned value) {
4939 for (intptr_t i = 0; i < array->length(); i++) {
4940 if (array->At(i) == value) {
4941 return true;
4942 }
4943 }
4944 return false;
4945}
4946
4947OutSet* OutSet::Extend(unsigned value, Zone* zone) {
4948 if (Get(value)) return this;
4949 if (successors() != nullptr) {
4950 for (int i = 0; i < successors()->length(); i++) {
4951 OutSet* successor = successors()->At(i);
4952 if (successor->Get(value)) return successor;
4953 }
4954 } else {
4955 successors_ = new (zone) ZoneGrowableArray<OutSet*>(2);
4956 }
4957 OutSet* result = new (zone) OutSet(first_, remaining_);
4958 result->Set(value, zone);
4959 successors()->Add(result);
4960 return result;
4961}
4962
4963void OutSet::Set(unsigned value, Zone* zone) {
4964 if (value < kFirstLimit) {
4965 first_ |= (1 << value);
4966 } else {
4967 if (remaining_ == nullptr)
4968 remaining_ = new (zone) ZoneGrowableArray<unsigned>(1);
4969
4970 bool remaining_contains_value = ArrayContains(remaining_, value);
4971 if (remaining_->is_empty() || !remaining_contains_value) {
4972 remaining_->Add(value);
4973 }
4974 }
4975}
4976
4977bool OutSet::Get(unsigned value) const {
4978 if (value < kFirstLimit) {
4979 return (first_ & (1 << value)) != 0;
4980 } else if (remaining_ == nullptr) {
4981 return false;
4982 } else {
4983 return ArrayContains(remaining_, value);
4984 }
4985}
4986
4988
4990 int32_t value,
4991 Zone* zone) {
4992 CharacterRange current = full_range;
4993 if (tree()->is_empty()) {
4994 // If this is the first range we just insert into the table.
4996 bool inserted = tree()->Insert(current.from(), &loc);
4997 ASSERT(inserted);
4998 USE(inserted);
4999 loc.set_value(
5000 Entry(current.from(), current.to(), empty()->Extend(value, zone)));
5001 return;
5002 }
5003 // First see if there is a range to the left of this one that
5004 // overlaps.
5006 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
5007 Entry* entry = &loc.value();
5008 // If we've found a range that overlaps with this one, and it
5009 // starts strictly to the left of this one, we have to fix it
5010 // because the following code only handles ranges that start on
5011 // or after the start point of the range we're adding.
5012 if (entry->from() < current.from() && entry->to() >= current.from()) {
5013 // Snap the overlapping range in half around the start point of
5014 // the range we're adding.
5015 CharacterRange left =
5016 CharacterRange::Range(entry->from(), current.from() - 1);
5017 CharacterRange right = CharacterRange::Range(current.from(), entry->to());
5018 // The left part of the overlapping range doesn't overlap.
5019 // Truncate the whole entry to be just the left part.
5020 entry->set_to(left.to());
5021 // The right part is the one that overlaps. We add this part
5022 // to the map and let the next step deal with merging it with
5023 // the range we're adding.
5025 bool inserted = tree()->Insert(right.from(), &loc);
5026 ASSERT(inserted);
5027 USE(inserted);
5028 loc.set_value(Entry(right.from(), right.to(), entry->out_set()));
5029 }
5030 }
5031 while (current.is_valid()) {
5032 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
5033 (loc.value().from() <= current.to()) &&
5034 (loc.value().to() >= current.from())) {
5035 Entry* entry = &loc.value();
5036 // We have overlap. If there is space between the start point of
5037 // the range we're adding and where the overlapping range starts
5038 // then we have to add a range covering just that space.
5039 if (current.from() < entry->from()) {
5041 bool inserted = tree()->Insert(current.from(), &ins);
5042 ASSERT(inserted);
5043 USE(inserted);
5044 ins.set_value(Entry(current.from(), entry->from() - 1,
5045 empty()->Extend(value, zone)));
5046 current.set_from(entry->from());
5047 }
5048 ASSERT(current.from() == entry->from());
5049 // If the overlapping range extends beyond the one we want to add
5050 // we have to snap the right part off and add it separately.
5051 if (entry->to() > current.to()) {
5053 bool inserted = tree()->Insert(current.to() + 1, &ins);
5054 ASSERT(inserted);
5055 USE(inserted);
5056 ins.set_value(Entry(current.to() + 1, entry->to(), entry->out_set()));
5057 entry->set_to(current.to());
5058 }
5059 ASSERT(entry->to() <= current.to());
5060 // The overlapping range is now completely contained by the range
5061 // we're adding so we can just update it and move the start point
5062 // of the range we're adding just past it.
5063 entry->AddValue(value, zone);
5064 ASSERT(entry->to() + 1 > current.from());
5065 current.set_from(entry->to() + 1);
5066 } else {
5067 // There is no overlap so we can just add the range
5069 bool inserted = tree()->Insert(current.from(), &ins);
5070 ASSERT(inserted);
5071 USE(inserted);
5072 ins.set_value(
5073 Entry(current.from(), current.to(), empty()->Extend(value, zone)));
5074 break;
5075 }
5076 }
5077}
5078
5081 if (!tree()->FindGreatestLessThan(value, &loc)) return empty();
5082 Entry* entry = &loc.value();
5083 if (value <= entry->to())
5084 return entry->out_set();
5085 else
5086 return empty();
5087}
5088
5089// -------------------------------------------------------------------
5090// Analysis
5091
5093 if (that->info()->been_analyzed || that->info()->being_analyzed) return;
5094 that->info()->being_analyzed = true;
5095 that->Accept(this);
5096 that->info()->being_analyzed = false;
5097 that->info()->been_analyzed = true;
5098}
5099
5100void Analysis::VisitEnd(EndNode* that) {
5101 // nothing to do
5102}
5103
5105 intptr_t element_count = elements()->length();
5106 // Set up the offsets of the elements relative to the start. This is a fixed
5107 // quantity since a TextNode can only contain fixed-width things.
5108 intptr_t cp_offset = 0;
5109 for (intptr_t i = 0; i < element_count; i++) {
5110 TextElement& elm = (*elements())[i];
5111 elm.set_cp_offset(cp_offset);
5112 cp_offset += elm.length();
5113 }
5114}
5115
5116void Analysis::VisitText(TextNode* that) {
5117 that->MakeCaseIndependent(is_one_byte_);
5118 EnsureAnalyzed(that->on_success());
5119 if (!has_failed()) {
5120 that->CalculateOffsets();
5121 }
5122}
5123
5124void Analysis::VisitAction(ActionNode* that) {
5125 RegExpNode* target = that->on_success();
5126 EnsureAnalyzed(target);
5127 if (!has_failed()) {
5128 // If the next node is interested in what it follows then this node
5129 // has to be interested too so it can pass the information on.
5130 that->info()->AddFromFollowing(target->info());
5131 }
5132}
5133
5134void Analysis::VisitChoice(ChoiceNode* that) {
5135 NodeInfo* info = that->info();
5136 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
5137 RegExpNode* node = (*that->alternatives())[i].node();
5138 EnsureAnalyzed(node);
5139 if (has_failed()) return;
5140 // Anything the following nodes need to know has to be known by
5141 // this node also, so it can pass it on.
5142 info->AddFromFollowing(node->info());
5143 }
5144}
5145
5147 NodeInfo* info = that->info();
5148 for (intptr_t i = 0; i < that->alternatives()->length(); i++) {
5149 RegExpNode* node = (*that->alternatives())[i].node();
5150 if (node != that->loop_node()) {
5151 EnsureAnalyzed(node);
5152 if (has_failed()) return;
5153 info->AddFromFollowing(node->info());
5154 }
5155 }
5156 // Check the loop last since it may need the value of this node
5157 // to get a correct result.
5158 EnsureAnalyzed(that->loop_node());
5159 if (!has_failed()) {
5160 info->AddFromFollowing(that->loop_node()->info());
5161 }
5162}
5163
5164void Analysis::VisitBackReference(BackReferenceNode* that) {
5165 EnsureAnalyzed(that->on_success());
5166}
5167
5168void Analysis::VisitAssertion(AssertionNode* that) {
5169 EnsureAnalyzed(that->on_success());
5170}
5171
5173 intptr_t budget,
5175 bool not_at_start) {
5176 // Working out the set of characters that a backreference can match is too
5177 // hard, so we just say that any character can match.
5178 bm->SetRest(offset);
5179 SaveBMInfo(bm, not_at_start, offset);
5180}
5181
5184
5186 intptr_t budget,
5188 bool not_at_start) {
5189 ZoneGrowableArray<GuardedAlternative>* alts = alternatives();
5190 budget = (budget - 1) / alts->length();
5191 for (intptr_t i = 0; i < alts->length(); i++) {
5192 GuardedAlternative& alt = (*alts)[i];
5193 if (alt.guards() != nullptr && alt.guards()->length() != 0) {
5194 bm->SetRest(offset); // Give up trying to fill in info.
5195 SaveBMInfo(bm, not_at_start, offset);
5196 return;
5197 }
5198 alt.node()->FillInBMInfo(offset, budget, bm, not_at_start);
5199 }
5200 SaveBMInfo(bm, not_at_start, offset);
5201}
5202
5203void TextNode::FillInBMInfo(intptr_t initial_offset,
5204 intptr_t budget,
5206 bool not_at_start) {
5207 if (initial_offset >= bm->length()) return;
5208 intptr_t offset = initial_offset;
5209 intptr_t max_char = bm->max_char();
5210 for (intptr_t i = 0; i < elements()->length(); i++) {
5211 if (offset >= bm->length()) {
5212 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5213 return;
5214 }
5215 TextElement text = elements()->At(i);
5216 if (text.text_type() == TextElement::ATOM) {
5217 RegExpAtom* atom = text.atom();
5218 for (intptr_t j = 0; j < atom->length(); j++, offset++) {
5219 if (offset >= bm->length()) {
5220 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5221 return;
5222 }
5223 uint16_t character = atom->data()->At(j);
5224 if (atom->flags().IgnoreCase()) {
5228 chars);
5229 for (intptr_t j = 0; j < length; j++) {
5230 bm->Set(offset, chars[j]);
5231 }
5232 } else {
5233 if (character <= max_char) bm->Set(offset, character);
5234 }
5235 }
5236 } else {
5237 ASSERT(text.text_type() == TextElement::CHAR_CLASS);
5238 RegExpCharacterClass* char_class = text.char_class();
5239 ZoneGrowableArray<CharacterRange>* ranges = char_class->ranges();
5240 if (char_class->is_negated()) {
5241 bm->SetAll(offset);
5242 } else {
5243 for (intptr_t k = 0; k < ranges->length(); k++) {
5244 const CharacterRange& range = ranges->At(k);
5245 if (range.from() > max_char) continue;
5246 intptr_t to =
5247 Utils::Minimum(max_char, static_cast<intptr_t>(range.to()));
5248 bm->SetInterval(offset, Interval(range.from(), to));
5249 }
5250 }
5251 offset++;
5252 }
5253 }
5254 if (offset >= bm->length()) {
5255 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5256 return;
5257 }
5258 on_success()->FillInBMInfo(offset, budget - 1, bm,
5259 true); // Not at start after a text node.
5260 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5261}
5262
5264 RegExpNode* on_success,
5266 // If the regexp matching starts within a surrogate pair, step back
5267 // to the lead surrogate and start matching from there.
5268 ASSERT(!compiler->read_backward());
5269 Zone* zone = compiler->zone();
5270
5271 auto lead_surrogates = CharacterRange::List(
5274 auto trail_surrogates = CharacterRange::List(
5277
5278 ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
5279
5280 int stack_register = compiler->UnicodeLookaroundStackRegister();
5281 int position_register = compiler->UnicodeLookaroundPositionRegister();
5283 lead_surrogates, /*read_backward=*/true, on_success, flags);
5284 RegExpLookaround::Builder builder(/*is_positive=*/true, step_back,
5285 stack_register, position_register);
5287 trail_surrogates, /*read_backward=*/false, builder.on_match_success(),
5288 flags);
5289
5290 optional_step_back->AddAlternative(
5291 GuardedAlternative(builder.ForMatch(match_trail)));
5292 optional_step_back->AddAlternative(GuardedAlternative(on_success));
5293
5294 return optional_step_back;
5295}
5296
5297#if !defined(DART_PRECOMPILED_RUNTIME)
5300 const ParsedFunction* parsed_function,
5301 const ZoneGrowableArray<const ICData*>& ic_data_array,
5302 intptr_t osr_id) {
5303 ASSERT(!FLAG_interpret_irregexp);
5304 Zone* zone = Thread::Current()->zone();
5305
5306 const Function& function = parsed_function->function();
5307 const intptr_t specialization_cid = function.string_specialization_cid();
5308 const bool is_sticky = function.is_sticky_specialization();
5309 const bool is_one_byte = (specialization_cid == kOneByteStringCid);
5310 RegExp& regexp = RegExp::Handle(zone, function.regexp());
5311 const String& pattern = String::Handle(zone, regexp.pattern());
5312
5313 ASSERT(!regexp.IsNull());
5314 ASSERT(!pattern.IsNull());
5315
5316 const bool is_global = regexp.flags().IsGlobal();
5317 const bool is_unicode = regexp.flags().IsUnicode();
5318
5319 RegExpCompiler compiler(data->capture_count, is_one_byte);
5320
5321 // TODO(zerny): Frequency sampling is currently disabled because of several
5322 // issues. We do not want to store subject strings in the regexp object since
5323 // they might be long and we should not prevent their garbage collection.
5324 // Passing them to this function explicitly does not help, since we must
5325 // generate exactly the same IR for both the unoptimizing and optimizing
5326 // pipelines (otherwise it gets confused when i.e. deopt id's differ).
5327 // An option would be to store sampling results in the regexp object, but
5328 // I'm not sure the performance gains are relevant enough.
5329
5330 // Wrap the body of the regexp in capture #0.
5331 RegExpNode* captured_body =
5332 RegExpCapture::ToNode(data->tree, 0, &compiler, compiler.accept());
5333
5334 RegExpNode* node = captured_body;
5335 const bool is_end_anchored = data->tree->IsAnchoredAtEnd();
5336 const bool is_start_anchored = data->tree->IsAnchoredAtStart();
5337 intptr_t max_length = data->tree->max_match();
5338 if (!is_start_anchored && !is_sticky) {
5339 // Add a .*? at the beginning, outside the body capture, unless
5340 // this expression is anchored at the beginning or is sticky.
5342 0, RegExpTree::kInfinity, false,
5343 new (zone) RegExpCharacterClass('*', RegExpFlags()), &compiler,
5344 captured_body, data->contains_anchor);
5345
5346 if (data->contains_anchor) {
5347 // Unroll loop once, to take care of the case that might start
5348 // at the start of input.
5349 ChoiceNode* first_step_node = new (zone) ChoiceNode(2, zone);
5350 first_step_node->AddAlternative(GuardedAlternative(captured_body));
5351 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
5352 new (zone) RegExpCharacterClass('*', RegExpFlags()),
5353 /*read_backwards=*/false, loop_node)));
5354 node = first_step_node;
5355 } else {
5356 node = loop_node;
5357 }
5358 }
5359 if (is_one_byte) {
5361 // Do it again to propagate the new nodes to places where they were not
5362 // put because they had not been calculated yet.
5363 if (node != nullptr) {
5365 }
5366 } else if (is_unicode && (is_global || is_sticky)) {
5367 node = OptionallyStepBackToLeadSurrogate(&compiler, node, regexp.flags());
5368 }
5369
5370 if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone);
5371 data->node = node;
5372 Analysis analysis(is_one_byte);
5373 analysis.EnsureAnalyzed(node);
5374 if (analysis.has_failed()) {
5375 const char* error_message = analysis.error_message();
5376 return CompilationResult(error_message);
5377 }
5378
5379 // Native regexp implementation.
5380
5381 IRRegExpMacroAssembler* macro_assembler = new (zone)
5382 IRRegExpMacroAssembler(specialization_cid, data->capture_count,
5383 parsed_function, ic_data_array, osr_id, zone);
5384
5385 // Inserted here, instead of in Assembler, because it depends on information
5386 // in the AST that isn't replicated in the Node structure.
5387 const intptr_t kMaxBacksearchLimit = 1024;
5388 if (is_end_anchored && !is_start_anchored && !is_sticky &&
5389 max_length < kMaxBacksearchLimit) {
5390 macro_assembler->SetCurrentPositionFromEnd(max_length);
5391 }
5392
5393 if (is_global) {
5395 if (data->tree->min_match() > 0) {
5397 } else if (is_unicode) {
5399 }
5400 macro_assembler->set_global_mode(mode);
5401 }
5402
5404 compiler.Assemble(macro_assembler, node, data->capture_count, pattern);
5405
5406 if (FLAG_trace_irregexp) {
5407 macro_assembler->PrintBlocks();
5408 }
5409
5410 return result;
5411}
5412#endif // !defined(DART_PRECOMPILED_RUNTIME)
5413
5416 const RegExp& regexp,
5417 bool is_one_byte,
5418 bool is_sticky,
5419 Zone* zone) {
5420 ASSERT(FLAG_interpret_irregexp);
5421 const String& pattern = String::Handle(zone, regexp.pattern());
5422
5423 ASSERT(!regexp.IsNull());
5424 ASSERT(!pattern.IsNull());
5425
5426 const bool is_global = regexp.flags().IsGlobal();
5427 const bool is_unicode = regexp.flags().IsUnicode();
5428
5429 RegExpCompiler compiler(data->capture_count, is_one_byte);
5430
5431 // TODO(zerny): Frequency sampling is currently disabled because of several
5432 // issues. We do not want to store subject strings in the regexp object since
5433 // they might be long and we should not prevent their garbage collection.
5434 // Passing them to this function explicitly does not help, since we must
5435 // generate exactly the same IR for both the unoptimizing and optimizing
5436 // pipelines (otherwise it gets confused when i.e. deopt id's differ).
5437 // An option would be to store sampling results in the regexp object, but
5438 // I'm not sure the performance gains are relevant enough.
5439
5440 // Wrap the body of the regexp in capture #0.
5441 RegExpNode* captured_body =
5442 RegExpCapture::ToNode(data->tree, 0, &compiler, compiler.accept());
5443
5444 RegExpNode* node = captured_body;
5445 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
5446 bool is_start_anchored = data->tree->IsAnchoredAtStart();
5447 intptr_t max_length = data->tree->max_match();
5448 if (!is_start_anchored && !is_sticky) {
5449 // Add a .*? at the beginning, outside the body capture, unless
5450 // this expression is anchored at the beginning.
5452 0, RegExpTree::kInfinity, false,
5453 new (zone) RegExpCharacterClass('*', RegExpFlags()), &compiler,
5454 captured_body, data->contains_anchor);
5455
5456 if (data->contains_anchor) {
5457 // Unroll loop once, to take care of the case that might start
5458 // at the start of input.
5459 ChoiceNode* first_step_node = new (zone) ChoiceNode(2, zone);
5460 first_step_node->AddAlternative(GuardedAlternative(captured_body));
5461 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
5462 new (zone) RegExpCharacterClass('*', RegExpFlags()),
5463 /*read_backwards=*/false, loop_node)));
5464 node = first_step_node;
5465 } else {
5466 node = loop_node;
5467 }
5468 }
5469 if (is_one_byte) {
5471 // Do it again to propagate the new nodes to places where they were not
5472 // put because they had not been calculated yet.
5473 if (node != nullptr) {
5475 }
5476 } else if (is_unicode && (is_global || is_sticky)) {
5477 node = OptionallyStepBackToLeadSurrogate(&compiler, node, regexp.flags());
5478 }
5479
5480 if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone);
5481 data->node = node;
5482 Analysis analysis(is_one_byte);
5483 analysis.EnsureAnalyzed(node);
5484 if (analysis.has_failed()) {
5485 const char* error_message = analysis.error_message();
5486 return CompilationResult(error_message);
5487 }
5488
5489 // Bytecode regexp implementation.
5490
5492 BytecodeRegExpMacroAssembler* macro_assembler =
5493 new (zone) BytecodeRegExpMacroAssembler(&buffer, zone);
5494
5495 // Inserted here, instead of in Assembler, because it depends on information
5496 // in the AST that isn't replicated in the Node structure.
5497 const intptr_t kMaxBacksearchLimit = 1024;
5498 if (is_end_anchored && !is_start_anchored && !is_sticky &&
5499 max_length < kMaxBacksearchLimit) {
5500 macro_assembler->SetCurrentPositionFromEnd(max_length);
5501 }
5502
5503 if (is_global) {
5505 if (data->tree->min_match() > 0) {
5507 } else if (is_unicode) {
5509 }
5510 macro_assembler->set_global_mode(mode);
5511 }
5512
5514 compiler.Assemble(macro_assembler, node, data->capture_count, pattern);
5515
5516 if (FLAG_trace_irregexp) {
5517 macro_assembler->PrintBlocks();
5518 }
5519
5520 return result;
5521}
5522
5524 Zone* zone,
5525 const RegExp& regexp,
5526 intptr_t specialization_cid,
5527 bool sticky,
5528 const Object& owner) {
5529 const intptr_t kParamCount = RegExpMacroAssembler::kParamCount;
5530
5531 const FunctionType& signature =
5533 const String& pattern = String::Handle(zone, regexp.pattern());
5534 Function& fn =
5535 Function::Handle(zone, Function::New(signature, pattern,
5536 UntaggedFunction::kIrregexpFunction,
5537 true, // Static.
5538 false, // Not const.
5539 false, // Not abstract.
5540 false, // Not external.
5541 false, // Not native.
5543
5544 // TODO(zerny): Share these arrays between all irregexp functions.
5545 // TODO(regis): Better, share a common signature.
5546 signature.set_num_fixed_parameters(kParamCount);
5547 signature.set_parameter_types(
5548 Array::Handle(zone, Array::New(kParamCount, Heap::kOld)));
5549 fn.CreateNameArray();
5551 Object::dynamic_type());
5553 Symbols::This());
5555 Object::dynamic_type());
5557 Symbols::string_param());
5559 Object::dynamic_type());
5561 Symbols::start_index_param());
5562 signature.set_result_type(Type::Handle(zone, Type::ArrayType()));
5563
5564 // Cache the result.
5565 regexp.set_function(specialization_cid, sticky, fn);
5566
5567 fn.SetRegExpData(regexp, specialization_cid, sticky);
5568 fn.set_is_debuggable(false);
5569
5570 // The function is compiled lazily during the first call.
5571}
5572
5574 const String& pattern,
5576 Zone* zone = thread->zone();
5577 const RegExp& regexp = RegExp::Handle(RegExp::New(zone));
5578
5579 regexp.set_pattern(pattern);
5580 regexp.set_flags(flags);
5581
5582 // TODO(zerny): We might want to use normal string searching algorithms
5583 // for simple patterns.
5584 regexp.set_is_complex();
5585 regexp.set_is_global(); // All dart regexps are global.
5586
5587 if (!FLAG_interpret_irregexp) {
5588 const Library& lib = Library::Handle(zone, Library::CoreLibrary());
5589 const Class& owner =
5590 Class::Handle(zone, lib.LookupClass(Symbols::RegExp()));
5591
5592 for (intptr_t cid = kOneByteStringCid; cid <= kTwoByteStringCid; cid++) {
5593 CreateSpecializedFunction(thread, zone, regexp, cid, /*sticky=*/false,
5594 owner);
5595 CreateSpecializedFunction(thread, zone, regexp, cid, /*sticky=*/true,
5596 owner);
5597 }
5598 }
5599
5600 return regexp.ptr();
5601}
5602
5603} // namespace dart
static const int outset
Definition: BlurTest.cpp:58
static void fail(const SkString &err)
Definition: DM.cpp:234
static void info(const char *fmt,...) SK_PRINTF_LIKE(1
Definition: DM.cpp:213
static constexpr uint64_t kBits
Definition: DrawPass.cpp:54
int count
Definition: FontMgrTest.cpp:50
static const int points[]
SkPoint pos
static void is_empty(skiatest::Reporter *reporter, const SkPath &p)
static float next(float f)
static const char marker[]
#define check(reporter, ref, unref, make, kill)
Definition: RefCntTest.cpp:85
static bool read(SkStream *stream, void *buffer, size_t amount)
static bool skip(SkStream *stream, size_t amount)
static bool ok(int result)
#define UNREACHABLE()
Definition: assert.h:248
GLenum type
intptr_t clear_register_count
Definition: regexp.h:631
static ActionNode * BeginSubmatch(intptr_t stack_pointer_reg, intptr_t position_reg, RegExpNode *on_success)
Definition: regexp.cc:792
static ActionNode * EmptyMatchCheck(intptr_t start_register, intptr_t repetition_register, intptr_t repetition_limit, RegExpNode *on_success)
Definition: regexp.cc:816
@ POSITIVE_SUBMATCH_SUCCESS
Definition: regexp.h:569
@ INCREMENT_REGISTER
Definition: regexp.h:566
static ActionNode * SetRegister(intptr_t reg, intptr_t val, RegExpNode *on_success)
Definition: regexp.cc:756
static ActionNode * PositiveSubmatchSuccess(intptr_t stack_pointer_reg, intptr_t restore_reg, intptr_t clear_capture_count, intptr_t clear_capture_from, RegExpNode *on_success)
Definition: regexp.cc:802
virtual intptr_t EatsAtLeast(intptr_t still_to_find, intptr_t budget, bool not_at_start)
Definition: regexp.cc:1473
static ActionNode * IncrementRegister(intptr_t reg, RegExpNode *on_success)
Definition: regexp.cc:766
static ActionNode * ClearCaptures(Interval range, RegExpNode *on_success)
Definition: regexp.cc:784
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: regexp.cc:3407
virtual void FillInBMInfo(intptr_t offset, intptr_t budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition: regexp.cc:1481
static ActionNode * StorePosition(intptr_t reg, bool is_capture, RegExpNode *on_success)
Definition: regexp.cc:774
AlternativeGenerationList(intptr_t count)
Definition: regexp.cc:2737
AlternativeGeneration * at(intptr_t i)
Definition: regexp.cc:2744
void EnsureAnalyzed(RegExpNode *node)
Definition: regexp.cc:5092
const char * error_message()
Definition: regexp.h:1417
virtual void VisitLoopChoice(LoopChoiceNode *that)
Definition: regexp.cc:5146
bool has_failed()
Definition: regexp.h:1416
static ArrayPtr New(intptr_t len, Heap::Space space=Heap::kNew)
Definition: object.h:10959
static AssertionNode * AfterNewline(RegExpNode *on_success)
Definition: regexp.h:742
virtual intptr_t EatsAtLeast(intptr_t still_to_find, intptr_t budget, bool not_at_start)
Definition: regexp.cc:1493
static AssertionNode * AtNonBoundary(RegExpNode *on_success)
Definition: regexp.h:739
static AssertionNode * AtStart(RegExpNode *on_success)
Definition: regexp.h:733
static AssertionNode * AtEnd(RegExpNode *on_success)
Definition: regexp.h:730
static AssertionNode * AtBoundary(RegExpNode *on_success)
Definition: regexp.h:736
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: regexp.cc:2316
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, intptr_t filled_in, bool not_at_start)
Definition: regexp.cc:2304
virtual void FillInBMInfo(intptr_t offset, intptr_t budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition: regexp.cc:1506
virtual void FillInBMInfo(intptr_t offset, intptr_t budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition: regexp.cc:5172
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: regexp.cc:3527
virtual intptr_t EatsAtLeast(intptr_t still_to_find, intptr_t recursion_depth, bool not_at_start)
Definition: regexp.cc:1516
void TruncateTo(intptr_t length)
void AddArray(const BaseGrowableArray< T, B, Allocator > &src)
void Add(const T &value)
const T & At(intptr_t index) const
intptr_t length() const
Convenience wrapper around a BlockEntryInstr pointer.
bool is_linked() const
void SetAll(intptr_t map_number)
Definition: regexp.h:1162
void SetInterval(intptr_t map_number, const Interval &interval)
Definition: regexp.h:1152
void Set(intptr_t map_number, intptr_t character)
Definition: regexp.h:1146
BoyerMooreLookahead(intptr_t length, RegExpCompiler *compiler, Zone *Zone)
Definition: regexp.cc:2825
intptr_t Count(intptr_t map_number)
Definition: regexp.h:1140
void SetRest(intptr_t from_map)
Definition: regexp.h:1164
void EmitSkipInstructions(RegExpMacroAssembler *masm)
Definition: regexp.cc:2948
void Set(intptr_t character)
Definition: regexp.cc:2788
static constexpr intptr_t kMapSize
Definition: regexp.h:1112
void SetInterval(const Interval &interval)
Definition: regexp.cc:2792
virtual void SetCurrentPositionFromEnd(intptr_t by)
static bool IsCanonical(ZoneGrowableArray< CharacterRange > *ranges)
Definition: regexp.cc:4772
static void AddClassEscape(uint16_t type, ZoneGrowableArray< CharacterRange > *ranges)
Definition: regexp.cc:4651
static CharacterRange Range(int32_t from, int32_t to)
Definition: regexp.h:40
static CharacterRange Everything()
Definition: regexp.h:44
bool Contains(int32_t i) const
Definition: regexp.h:53
static CharacterRange Singleton(int32_t value)
Definition: regexp.h:37
int32_t to() const
Definition: regexp.h:56
void set_from(int32_t value)
Definition: regexp.h:55
int32_t from() const
Definition: regexp.h:54
static void Canonicalize(ZoneGrowableArray< CharacterRange > *ranges)
Definition: regexp.cc:4876
bool is_valid() const
Definition: regexp.h:58
static void AddCaseEquivalents(ZoneGrowableArray< CharacterRange > *ranges, bool is_one_byte, Zone *zone)
Definition: regexp.cc:4691
static void Negate(ZoneGrowableArray< CharacterRange > *src, ZoneGrowableArray< CharacterRange > *dst)
Definition: regexp.cc:4912
static ZoneGrowableArray< CharacterRange > * List(Zone *zone, CharacterRange range)
Definition: regexp.h:47
void set_standard_set_type(uint16_t special_set_type)
Definition: regexp_ast.h:140
ZoneGrowableArray< CharacterRange > * ranges()
Definition: regexp.cc:4785
void Canonicalize()
Definition: regexp.cc:4869
virtual bool try_to_emit_quick_check_for_alternative(bool is_first)
Definition: regexp.h:928
virtual intptr_t EatsAtLeast(intptr_t still_to_find, intptr_t budget, bool not_at_start)
Definition: regexp.cc:1582
ZoneGrowableArray< GuardedAlternative > * alternatives_
Definition: regexp.h:937
virtual RegExpNode * FilterOneByte(intptr_t depth)
Definition: regexp.cc:2049
void set_not_at_start()
Definition: regexp.h:926
virtual void FillInBMInfo(intptr_t offset, intptr_t budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition: regexp.cc:5185
intptr_t EatsAtLeastHelper(intptr_t still_to_find, intptr_t budget, RegExpNode *ignore_this_node, bool not_at_start)
Definition: regexp.cc:1557
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, intptr_t characters_filled_in, bool not_at_start)
Definition: regexp.cc:2143
void AddAlternative(GuardedAlternative node)
Definition: regexp.h:903
intptr_t GreedyLoopTextLengthForAlternative(const GuardedAlternative *alternative)
Definition: regexp.cc:2632
ZoneGrowableArray< GuardedAlternative > * alternatives()
Definition: regexp.h:904
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: regexp.cc:3127
bool not_at_start()
Definition: regexp.h:925
static const int32_t kNoKey
Definition: regexp.h:149
void set_to(int32_t value)
Definition: regexp.h:133
OutSet * out_set()
Definition: regexp.h:137
void AddValue(int value, Zone *zone)
Definition: regexp.h:134
OutSet * Get(int32_t value)
Definition: regexp.cc:5079
void AddRange(CharacterRange range, int32_t value, Zone *zone)
Definition: regexp.cc:4989
void ForEach(Callback *callback)
Definition: regexp.h:166
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: regexp.cc:728
intptr_t Frequency(intptr_t in_character)
Definition: regexp.cc:261
void CountCharacter(intptr_t character)
Definition: regexp.cc:253
void set_result_type(const AbstractType &value) const
Definition: object.cc:8575
void SetParameterTypeAt(intptr_t index, const AbstractType &value) const
Definition: object.cc:8590
void set_num_fixed_parameters(intptr_t value) const
Definition: object.cc:11608
void set_parameter_types(const Array &value) const
Definition: object.cc:8597
static FunctionTypePtr New(intptr_t num_parent_type_arguments=0, Nullability nullability=Nullability::kNonNullable, Heap::Space space=Heap::kOld)
Definition: object.cc:11631
void CreateNameArray(Heap::Space space=Heap::kOld) const
Definition: object.cc:8677
static FunctionPtr New(const FunctionType &signature, const String &name, UntaggedFunction::Kind kind, bool is_static, bool is_const, bool is_abstract, bool is_external, bool is_native, const Object &owner, TokenPosition token_pos, Heap::Space space=Heap::kOld)
Definition: object.cc:10243
void SetParameterNameAt(intptr_t index, const String &value) const
Definition: object.cc:8623
void SetRegExpData(const RegExp &regexp, intptr_t string_specialization_cid, bool sticky) const
Definition: object.cc:8492
GreedyLoopState(bool not_at_start)
Definition: regexp.cc:3091
Trace * counter_backtrack_trace()
Definition: regexp.h:1366
BlockLabel * label()
Definition: regexp.h:1365
intptr_t reg()
Definition: regexp.h:866
intptr_t value()
Definition: regexp.h:868
Relation op()
Definition: regexp.h:867
RegExpNode * node() const
Definition: regexp.h:881
ZoneGrowableArray< Guard * > * guards() const
Definition: regexp.h:883
void AddGuard(Guard *guard, Zone *zone)
Definition: regexp.cc:751
@ kOld
Definition: heap.h:39
virtual void SetCurrentPositionFromEnd(intptr_t by)
bool Contains(intptr_t value) const
Definition: regexp.h:524
intptr_t to() const
Definition: regexp.h:529
intptr_t from() const
Definition: regexp.h:528
bool is_empty() const
Definition: regexp.h:527
static uint16_t TryConvertToLatin1(uint16_t c)
Definition: unicode.h:238
static LibraryPtr CoreLibrary()
Definition: object.cc:14787
ClassPtr LookupClass(const String &name) const
Definition: object.cc:14105
virtual intptr_t EatsAtLeast(intptr_t still_to_find, intptr_t budget, bool not_at_start)
Definition: regexp.cc:1576
virtual void Accept(NodeVisitor *visitor)
Definition: regexp.cc:835
void AddContinueAlternative(GuardedAlternative alt)
Definition: regexp.cc:2660
RegExpNode * loop_node()
Definition: regexp.h:1034
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: regexp.cc:2666
virtual void FillInBMInfo(intptr_t offset, intptr_t budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition: regexp.cc:2130
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, intptr_t characters_filled_in, bool not_at_start)
Definition: regexp.cc:2120
void AddLoopAlternative(GuardedAlternative alt)
Definition: regexp.cc:2654
virtual RegExpNode * FilterOneByte(intptr_t depth)
Definition: regexp.cc:2033
virtual intptr_t EatsAtLeast(intptr_t still_to_find, intptr_t budget, bool not_at_start)
Definition: regexp.cc:1536
virtual RegExpNode * FilterOneByte(intptr_t depth)
Definition: regexp.cc:2099
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, intptr_t characters_filled_in, bool not_at_start)
Definition: regexp.cc:1546
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: regexp.cc:702
virtual void VisitLoopChoice(LoopChoiceNode *that)
Definition: regexp.h:1390
static void static void PrintErr(const char *format,...) PRINTF_ATTRIBUTE(1
ObjectPtr ptr() const
Definition: object.h:332
bool IsNull() const
Definition: object.h:363
static Object & Handle()
Definition: object.h:407
static Object & ZoneHandle()
Definition: object.h:419
OutSet * Extend(unsigned value, Zone *zone)
Definition: regexp.cc:4947
bool Get(unsigned value) const
Definition: regexp.cc:4977
const Function & function() const
Definition: parser.h:73
intptr_t characters()
Definition: regexp.h:359
Position * positions(intptr_t index)
Definition: regexp.h:361
bool Rationalize(bool one_byte)
Definition: regexp.cc:1598
void Advance(intptr_t by, bool one_byte)
Definition: regexp.cc:1871
uint32_t value()
Definition: regexp.h:367
void Merge(QuickCheckDetails *other, intptr_t from_index)
Definition: regexp.cc:1892
RecursionCheck(RegExpCompiler *compiler)
Definition: regexp.cc:375
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: regexp.cc:4584
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: regexp.cc:4433
AssertionType assertion_type() const
Definition: regexp_ast.h:121
intptr_t length() const
Definition: regexp_ast.h:232
RegExpFlags flags() const
Definition: regexp_ast.h:233
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: regexp.cc:3795
virtual void AppendToText(RegExpText *text)
Definition: regexp.cc:212
ZoneGrowableArray< uint16_t > * data() const
Definition: regexp_ast.h:231
bool ignore_case() const
Definition: regexp_ast.h:234
intptr_t index() const
Definition: regexp_ast.h:419
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: regexp.cc:4489
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: regexp.cc:4562
static intptr_t EndRegister(intptr_t index)
Definition: regexp_ast.h:338
static intptr_t StartRegister(intptr_t index)
Definition: regexp_ast.h:337
uint16_t standard_type() const
Definition: regexp_ast.h:205
RegExpCharacterClass(ZoneGrowableArray< CharacterRange > *ranges, RegExpFlags flags, CharacterClassFlags character_class_flags=DefaultFlags())
Definition: regexp_ast.h:166
virtual void AppendToText(RegExpText *text)
Definition: regexp.cc:216
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: regexp.cc:4166
bool contains_split_surrogate() const
Definition: regexp_ast.h:209
ZoneGrowableArray< CharacterRange > * ranges()
Definition: regexp_ast.h:206
bool read_backward()
Definition: regexp.cc:344
intptr_t AllocateRegister()
Definition: regexp.cc:296
static constexpr intptr_t kImplementationOffset
Definition: regexp.cc:329
void AddWork(RegExpNode *node)
Definition: regexp.cc:327
void set_read_backward(bool value)
Definition: regexp.cc:345
static constexpr intptr_t kMaxRecursion
Definition: regexp.cc:336
RegExpCompiler(intptr_t capture_count, bool is_one_byte)
Definition: regexp.cc:390
void set_current_expansion_factor(intptr_t value)
Definition: regexp.cc:349
Zone * zone() const
Definition: regexp.cc:353
RegExpEngine::CompilationResult Assemble(IRRegExpMacroAssembler *assembler, RegExpNode *start, intptr_t capture_count, const String &pattern)
Definition: regexp.cc:405
void DecrementRecursionDepth()
Definition: regexp.cc:339
FrequencyCollator * frequency_collator()
Definition: regexp.cc:346
intptr_t UnicodeLookaroundPositionRegister()
Definition: regexp.cc:307
static constexpr intptr_t kCodeOffset
Definition: regexp.cc:331
bool one_byte() const
Definition: regexp.cc:343
EndNode * accept()
Definition: regexp.cc:334
static constexpr intptr_t kNoRegister
Definition: regexp.cc:355
intptr_t UnicodeLookaroundStackRegister()
Definition: regexp.cc:300
RegExpMacroAssembler * macro_assembler()
Definition: regexp.cc:333
intptr_t recursion_depth()
Definition: regexp.cc:337
void IncrementRecursionDepth()
Definition: regexp.cc:338
void SetRegExpTooBig()
Definition: regexp.cc:341
intptr_t current_expansion_factor()
Definition: regexp.cc:348
static constexpr intptr_t kNumberOfRegistersOffset
Definition: regexp.cc:330
ZoneGrowableArray< RegExpTree * > * alternatives() const
Definition: regexp_ast.h:73
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: regexp.cc:4203
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: regexp.cc:4496
static void DotPrint(const char *label, RegExpNode *node, bool ignore_case)
static CompilationResult CompileBytecode(RegExpCompileData *data, const RegExp &regexp, bool is_one_byte, bool sticky, Zone *zone)
Definition: regexp.cc:5414
static CompilationResult CompileIR(RegExpCompileData *input, const ParsedFunction *parsed_function, const ZoneGrowableArray< const ICData * > &ic_data_array, intptr_t osr_id)
Definition: regexp.cc:5298
static RegExpPtr CreateRegExp(Thread *thread, const String &pattern, RegExpFlags flags)
Definition: regexp.cc:5573
RegExpExpansionLimiter(RegExpCompiler *compiler, intptr_t factor)
Definition: regexp.cc:4226
static constexpr intptr_t kMaxExpansionFactor
Definition: regexp.cc:4225
bool NeedsUnicodeCaseEquivalents()
Definition: object.h:12728
bool IsGlobal() const
Definition: object.h:12722
bool IsUnicode() const
Definition: object.h:12725
bool IgnoreCase() const
Definition: object.h:12723
Builder(bool is_positive, RegExpNode *on_success, intptr_t stack_pointer_register, intptr_t position_register, intptr_t capture_register_count=0, intptr_t capture_register_start=0)
Definition: regexp.cc:4501
RegExpNode * ForMatch(RegExpNode *match)
Definition: regexp.cc:4522
RegExpTree * body() const
Definition: regexp_ast.h:368
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: regexp.cc:4540
bool is_positive() const
Definition: regexp_ast.h:369
void set_global_mode(GlobalMode mode)
virtual void CheckCharacterLT(uint16_t limit, BlockLabel *on_less)=0
virtual void CheckCharacterInRange(uint16_t from, uint16_t to, BlockLabel *on_in_range)=0
virtual void CheckNotBackReferenceIgnoreCase(intptr_t start_reg, bool read_backward, bool unicode, BlockLabel *on_no_match)=0
virtual void CheckNotCharacter(unsigned c, BlockLabel *on_not_equal)=0
virtual void ReadStackPointerFromRegister(intptr_t reg)=0
static constexpr intptr_t kTableSizeBits
static constexpr intptr_t kTableMask
virtual void PopCurrentPosition()=0
virtual void CheckNotAtStart(intptr_t cp_offset, BlockLabel *on_not_at_start)=0
virtual void CheckPosition(intptr_t cp_offset, BlockLabel *on_outside_input)
virtual void CheckAtStart(BlockLabel *on_at_start)=0
static constexpr intptr_t kTableSize
virtual void CheckCharacterAfterAnd(unsigned c, unsigned and_with, BlockLabel *on_equal)=0
virtual void IfRegisterGE(intptr_t reg, intptr_t comparand, BlockLabel *if_ge)=0
virtual void ReadCurrentPositionFromRegister(intptr_t reg)=0
virtual void WriteCurrentPositionToRegister(intptr_t reg, intptr_t cp_offset)=0
virtual void CheckCharacterNotInRange(uint16_t from, uint16_t to, BlockLabel *on_not_in_range)=0
virtual void CheckNotCharacterAfterAnd(unsigned c, unsigned and_with, BlockLabel *on_not_equal)=0
virtual void IfRegisterEqPos(intptr_t reg, BlockLabel *if_eq)=0
virtual void CheckCharacterGT(uint16_t limit, BlockLabel *on_greater)=0
virtual void CheckNotBackReference(intptr_t start_reg, bool read_backward, BlockLabel *on_no_match)=0
virtual bool Succeed()=0
virtual void PushCurrentPosition()=0
virtual void AdvanceCurrentPosition(intptr_t by)=0
virtual void BindBlock(BlockLabel *label)=0
virtual void CheckCharacter(unsigned c, BlockLabel *on_equal)=0
virtual void IfRegisterLT(intptr_t reg, intptr_t comparand, BlockLabel *if_lt)=0
virtual void CheckGreedyLoop(BlockLabel *on_tos_equals_current_position)=0
virtual void GoTo(BlockLabel *to)=0
virtual void CheckBitInTable(const TypedData &table, BlockLabel *on_bit_set)=0
void CheckNotInSurrogatePair(intptr_t cp_offset, BlockLabel *on_failure)
virtual bool CheckSpecialCharacterClass(uint16_t type, BlockLabel *on_no_match)
virtual void Backtrack()=0
virtual void CheckNotCharacterAfterMinusAnd(uint16_t c, uint16_t minus, uint16_t and_with, BlockLabel *on_not_equal)=0
virtual void LoadCurrentCharacter(intptr_t cp_offset, BlockLabel *on_end_of_input, bool check_bounds=true, intptr_t characters=1)=0
virtual void PushBacktrack(BlockLabel *label)=0
virtual void WriteStackPointerToRegister(intptr_t reg)=0
virtual void ClearRegisters(intptr_t reg_from, intptr_t reg_to)=0
static constexpr intptr_t kMaxCPOffset
virtual void CheckPreemption(bool is_backtrack)
virtual void Accept(NodeVisitor *visitor)=0
virtual void Emit(RegExpCompiler *compiler, Trace *trace)=0
NodeInfo * info()
Definition: regexp.h:477
static constexpr intptr_t kNodeIsTooComplexForGreedyLoops
Definition: regexp.h:422
virtual ~RegExpNode()
Definition: regexp.cc:1429
virtual RegExpNode * FilterOneByte(intptr_t depth)
Definition: regexp.h:449
LimitResult LimitVersions(RegExpCompiler *compiler, Trace *trace)
Definition: regexp.cc:1431
bool EmitQuickCheck(RegExpCompiler *compiler, Trace *bounds_check_trace, Trace *trace, bool preload_has_checked_bounds, BlockLabel *on_possible_success, QuickCheckDetails *details_return, bool fall_through_on_failure)
Definition: regexp.cc:1621
Zone * zone() const
Definition: regexp.h:483
BoyerMooreLookahead * bm_info(bool not_at_start)
Definition: regexp.h:479
virtual intptr_t GreedyLoopTextLength()
Definition: regexp.h:423
virtual void FillInBMInfo(intptr_t offset, intptr_t budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition: regexp.h:439
RegExpNode(Zone *zone)
Definition: regexp.h:386
static constexpr intptr_t kRecursionBudget
Definition: regexp.h:438
virtual intptr_t EatsAtLeast(intptr_t still_to_find, intptr_t budget, bool not_at_start)=0
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, intptr_t characters_filled_in, bool not_at_start)=0
intptr_t max() const
Definition: regexp_ast.h:296
intptr_t min() const
Definition: regexp_ast.h:295
RegExpTree * body() const
Definition: regexp_ast.h:300
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: regexp.cc:4216
bool is_greedy() const
Definition: regexp_ast.h:299
virtual void AppendToText(RegExpText *text)
Definition: regexp.cc:220
GrowableArray< TextElement > * elements()
Definition: regexp_ast.h:256
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: regexp.cc:3803
virtual intptr_t min_match() const =0
virtual Interval CaptureRegisters() const
Definition: regexp_ast.h:51
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)=0
static constexpr intptr_t kInfinity
Definition: regexp_ast.h:39
virtual void AppendToText(RegExpText *text)
Definition: regexp.cc:208
void set_pattern(const String &pattern) const
Definition: object.cc:26612
StringPtr pattern() const
Definition: object.h:12797
void set_function(intptr_t cid, bool sticky, const Function &value) const
Definition: object.cc:26616
static RegExpPtr New(Zone *zone, Heap::Space space=Heap::kNew)
Definition: object.cc:26662
void set_is_complex() const
Definition: object.h:12883
void set_flags(RegExpFlags flags) const
Definition: object.h:12894
RegExpFlags flags() const
Definition: object.h:12891
void set_is_global() const
Definition: object.h:12867
RegExpNode * on_success()
Definition: regexp.h:544
virtual RegExpNode * FilterOneByte(intptr_t depth)
Definition: regexp.cc:1931
RegExpNode * FilterSuccessor(intptr_t depth)
Definition: regexp.cc:1939
bool Insert(const Key &key, Locator *locator)
@ kMaxOneCharCodeSymbol
Definition: symbols.h:577
static const String & This()
Definition: symbols.h:692
static TextElement CharClass(RegExpCharacterClass *char_class)
Definition: regexp.cc:229
TextType text_type() const
Definition: regexp.h:246
RegExpCharacterClass * char_class() const
Definition: regexp.h:255
intptr_t length() const
Definition: regexp.cc:233
static TextElement Atom(RegExpAtom *atom)
Definition: regexp.cc:225
intptr_t cp_offset() const
Definition: regexp.h:242
RegExpAtom * atom() const
Definition: regexp.h:250
void set_cp_offset(intptr_t cp_offset)
Definition: regexp.h:243
virtual RegExpNode * FilterOneByte(intptr_t depth)
Definition: regexp.cc:1977
virtual void Emit(RegExpCompiler *compiler, Trace *trace)
Definition: regexp.cc:2514
virtual RegExpNode * GetSuccessorOfOmnivorousTextNode(RegExpCompiler *compiler)
Definition: regexp.cc:2604
virtual intptr_t GreedyLoopTextLength()
Definition: regexp.cc:2599
static TextNode * CreateForCharacterRanges(ZoneGrowableArray< CharacterRange > *ranges, bool read_backward, RegExpNode *on_success, RegExpFlags flags)
Definition: regexp.cc:2480
virtual intptr_t EatsAtLeast(intptr_t still_to_find, intptr_t budget, bool not_at_start)
Definition: regexp.cc:1524
virtual void FillInBMInfo(intptr_t offset, intptr_t budget, BoyerMooreLookahead *bm, bool not_at_start)
Definition: regexp.cc:5203
void MakeCaseIndependent(bool is_one_byte)
Definition: regexp.cc:2581
virtual void GetQuickCheckDetails(QuickCheckDetails *details, RegExpCompiler *compiler, intptr_t characters_filled_in, bool not_at_start)
Definition: regexp.cc:1700
static TextNode * CreateForSurrogatePair(CharacterRange lead, CharacterRange trail, bool read_backward, RegExpNode *on_success, RegExpFlags flags)
Definition: regexp.cc:2491
void CalculateOffsets()
Definition: regexp.cc:5104
Zone * zone() const
Definition: thread_state.h:37
static Thread * Current()
Definition: thread.h:362
static const TokenPosition kMinSource
bool Mentions(intptr_t reg)
Definition: regexp.cc:461
ActionNode::ActionType action_type()
Definition: regexp.h:1215
intptr_t characters_preloaded()
Definition: regexp.h:1305
QuickCheckDetails * quick_check_performed()
Definition: regexp.h:1308
bool GetStoredPosition(intptr_t reg, intptr_t *cp_offset)
Definition: regexp.cc:478
void set_loop_label(BlockLabel *label)
Definition: regexp.h:1323
void InvalidateCurrentCharacter()
Definition: regexp.cc:2558
void set_at_start(TriBool at_start)
Definition: regexp.h:1301
void set_bound_checked_up_to(intptr_t to)
Definition: regexp.h:1327
BlockLabel * loop_label()
Definition: regexp.h:1303
void Flush(RegExpCompiler *compiler, RegExpNode *successor)
Definition: regexp.cc:649
void set_characters_preloaded(intptr_t count)
Definition: regexp.h:1324
bool mentions_reg(intptr_t reg)
Definition: regexp.cc:470
RegExpNode * stop_node()
Definition: regexp.h:1304
void set_stop_node(RegExpNode *node)
Definition: regexp.h:1322
intptr_t cp_offset()
Definition: regexp.h:1283
bool is_trivial()
Definition: regexp.h:1295
BlockLabel * backtrack()
Definition: regexp.h:1302
intptr_t bound_checked_up_to()
Definition: regexp.h:1306
TriBool at_start()
Definition: regexp.h:1300
void set_quick_check_performed(QuickCheckDetails *d)
Definition: regexp.h:1329
DeferredAction * actions()
Definition: regexp.h:1284
intptr_t flush_budget()
Definition: regexp.h:1307
void add_action(DeferredAction *new_action)
Definition: regexp.h:1316
void set_flush_budget(intptr_t to)
Definition: regexp.h:1328
@ FALSE_VALUE
Definition: regexp.h:1206
@ TRUE_VALUE
Definition: regexp.h:1206
void AdvanceCurrentPositionInTrace(intptr_t by, RegExpCompiler *compiler)
Definition: regexp.cc:2562
void set_backtrack(BlockLabel *backtrack)
Definition: regexp.h:1321
static TypePtr ArrayType()
static TypedDataPtr New(intptr_t class_id, intptr_t len, Heap::Space space=Heap::kNew)
Definition: object.cc:25587
ZoneGrowableArray< CharacterRange > * lead_surrogates()
Definition: regexp.h:186
UnicodeRangeSplitter(Zone *zone, ZoneGrowableArray< CharacterRange > *base)
Definition: regexp.cc:3899
ZoneGrowableArray< CharacterRange > * trail_surrogates()
Definition: regexp.h:189
ZoneGrowableArray< CharacterRange > * non_bmp() const
Definition: regexp.h:192
ZoneGrowableArray< CharacterRange > * bmp()
Definition: regexp.h:185
void Call(uint32_t from, ChoiceTable::Entry entry)
Definition: regexp.cc:3939
static constexpr int32_t kLeadSurrogateStart
Definition: unicode.h:159
static constexpr int32_t kMaxCodeUnit
Definition: unicode.h:158
static constexpr int32_t kTrailSurrogateStart
Definition: unicode.h:161
static void Encode(int32_t codepoint, uint16_t *dst)
Definition: unicode.cc:273
static constexpr int32_t kTrailSurrogateEnd
Definition: unicode.h:162
static constexpr int32_t kLeadSurrogateEnd
Definition: unicode.h:160
static constexpr int32_t kMaxCodePoint
Definition: unicode.h:18
static constexpr int32_t kInvalidChar
Definition: unicode.h:19
static constexpr T Maximum(T x, T y)
Definition: utils.h:41
static T Minimum(T x, T y)
Definition: utils.h:36
VisitMarker(NodeInfo *info)
Definition: regexp.cc:1921
intptr_t get(int32_t c, int32_t n, int32_t *result)
Definition: unibrow-inl.h:15
#define DECLARE_VISIT(type, attrs)
#define UNIMPLEMENTED
#define ASSERT(E)
EMSCRIPTEN_KEEPALIVE void empty()
AtkStateType state
FlutterSemanticsFlag flags
glong glong end
uint8_t value
GAsyncResult * result
uint32_t * target
Dart_NativeFunction function
Definition: fuchsia.cc:51
static float max(float r, float g, float b)
Definition: hsl.cpp:49
static float min(float r, float g, float b)
Definition: hsl.cpp:48
size_t length
std::u16string text
const GrXPFactory * Get(SkBlendMode mode)
def match(bench, filt)
Definition: benchmark.py:23
Definition: dart_vm.cc:33
static constexpr intptr_t kWordRangeCount
Definition: regexp.cc:2778
static void SplitSearchSpace(ZoneGrowableArray< uint16_t > *ranges, intptr_t start_index, intptr_t end_index, intptr_t *new_start_index, intptr_t *new_end_index, uint16_t *border)
Definition: regexp.cc:1148
static constexpr intptr_t kMaxLookaheadForBoyerMoore
Definition: regexp.cc:34
static constexpr int32_t kLineTerminatorRanges[]
Definition: regexp.cc:2783
const char *const name
static intptr_t InsertRangeInCanonicalList(ZoneGrowableArray< CharacterRange > *list, intptr_t count, CharacterRange insert)
Definition: regexp.cc:4811
static bool CompareInverseRanges(ZoneGrowableArray< CharacterRange > *ranges, const int32_t *special_class, intptr_t length)
Definition: regexp.cc:3813
static void AddClassNegated(const int32_t *elmv, intptr_t elmc, ZoneGrowableArray< CharacterRange > *ranges)
Definition: regexp.cc:4611
constexpr int32_t kMinInt32
Definition: globals.h:482
void PrintUtf16(uint16_t c)
void AddBmpCharacters(RegExpCompiler *compiler, ChoiceNode *result, RegExpNode *on_success, UnicodeRangeSplitter *splitter)
Definition: regexp.cc:3959
static void EmitCharClass(RegExpMacroAssembler *macro_assembler, RegExpCharacterClass *cc, bool one_byte, BlockLabel *on_failure, intptr_t cp_offset, bool check_offset, bool preloaded, Zone *zone)
Definition: regexp.cc:1331
static constexpr intptr_t kSurrogateRangeCount
Definition: regexp.cc:2782
static void EmitHat(RegExpCompiler *compiler, RegExpNode *on_success, Trace *trace)
Definition: regexp.cc:2188
static constexpr int32_t kSpaceRanges[]
Definition: regexp.cc:2771
void AddLoneLeadSurrogates(RegExpCompiler *compiler, ChoiceNode *result, RegExpNode *on_success, UnicodeRangeSplitter *splitter)
Definition: regexp.cc:4065
static constexpr bool kRegexpOptimization
Definition: regexp.cc:31
static uint16_t ConvertNonLatin1ToLatin1(uint16_t c)
Definition: regexp.cc:1962
static constexpr int32_t kRangeEndMarker
Definition: regexp.cc:2764
static void UpdateBoundsCheck(intptr_t index, intptr_t *checked_up_to)
Definition: regexp.cc:2357
static bool ShortCutEmitCharacterPair(RegExpMacroAssembler *macro_assembler, bool one_byte, uint16_t c1, uint16_t c2, BlockLabel *on_failure)
Definition: regexp.cc:935
ContainedInLattice AddRange(ContainedInLattice containment, const int32_t *ranges, intptr_t ranges_length, Interval new_range)
Definition: regexp.cc:36
intptr_t word
Definition: globals.h:500
static bool RangeContainsLatin1Equivalents(CharacterRange range)
Definition: regexp.cc:1947
static void EmitDoubleBoundaryTest(RegExpMacroAssembler *masm, uint16_t first, uint16_t last, BlockLabel *fall_through, BlockLabel *in_range, BlockLabel *out_of_range)
Definition: regexp.cc:1040
@ kNoRegister
Definition: constants_arm.h:99
void CreateSpecializedFunction(Thread *thread, Zone *zone, const RegExp &regexp, intptr_t specialization_cid, bool sticky, const Object &owner)
Definition: regexp.cc:5523
void AddNonBmpSurrogatePairs(RegExpCompiler *compiler, ChoiceNode *result, RegExpNode *on_success, UnicodeRangeSplitter *splitter)
Definition: regexp.cc:3969
RegExpNode * MatchAndNegativeLookaroundInReadDirection(RegExpCompiler *compiler, ZoneGrowableArray< CharacterRange > *match, ZoneGrowableArray< CharacterRange > *lookahead, RegExpNode *on_success, bool read_backward, RegExpFlags flags)
Definition: regexp.cc:4048
static constexpr int32_t kWordRanges[]
Definition: regexp.cc:2776
static constexpr int32_t kSurrogateRanges[]
Definition: regexp.cc:2781
static constexpr intptr_t kSpaceRangeCount
Definition: regexp.cc:2775
static bool EmitAtomLetter(Zone *zone, RegExpCompiler *compiler, uint16_t c, BlockLabel *on_failure, intptr_t cp_offset, bool check, bool preloaded)
Definition: regexp.cc:981
static void EmitWordCheck(RegExpMacroAssembler *assembler, BlockLabel *word, BlockLabel *non_word, bool fall_through_on_word)
Definition: regexp.cc:2163
static void USE(T &&)
Definition: globals.h:618
RegExpNode * OptionallyStepBackToLeadSurrogate(RegExpCompiler *compiler, RegExpNode *on_success, RegExpFlags flags)
Definition: regexp.cc:5263
bool EmitCharacterFunction(Zone *zone, RegExpCompiler *compiler, uint16_t c, BlockLabel *on_failure, intptr_t cp_offset, bool check, bool preloaded)
Definition: regexp.cc:971
void AddUnicodeCaseEquivalents(ZoneGrowableArray< CharacterRange > *ranges)
Definition: regexp.cc:4135
ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b)
Definition: regexp.h:1087
const intptr_t cid
void AddLoneTrailSurrogates(RegExpCompiler *compiler, ChoiceNode *result, RegExpNode *on_success, UnicodeRangeSplitter *splitter)
Definition: regexp.cc:4093
static bool ArrayContains(ZoneGrowableArray< unsigned > *array, unsigned value)
Definition: regexp.cc:4938
static void CutOutRange(RegExpMacroAssembler *masm, ZoneGrowableArray< uint16_t > *ranges, intptr_t start_index, intptr_t end_index, intptr_t cut_index, BlockLabel *even_label, BlockLabel *odd_label)
Definition: regexp.cc:1121
RegExpNode * NegativeLookaroundAgainstReadDirectionAndMatch(RegExpCompiler *compiler, ZoneGrowableArray< CharacterRange > *lookbehind, ZoneGrowableArray< CharacterRange > *match, RegExpNode *on_success, bool read_backward, RegExpFlags flags)
Definition: regexp.cc:4030
static constexpr intptr_t kDigitRangeCount
Definition: regexp.cc:2780
static uint32_t SmearBitsRight(uint32_t v)
Definition: regexp.cc:1589
static void AddClass(const int32_t *elmv, intptr_t elmc, ZoneGrowableArray< CharacterRange > *ranges)
Definition: regexp.cc:4600
static intptr_t GetCaseIndependentLetters(uint16_t character, bool one_byte_subject, int32_t *letters)
Definition: regexp.cc:861
static void GenerateBranches(RegExpMacroAssembler *masm, ZoneGrowableArray< uint16_t > *ranges, intptr_t start_index, intptr_t end_index, uint16_t min_char, uint16_t max_char, BlockLabel *fall_through, BlockLabel *even_label, BlockLabel *odd_label)
Definition: regexp.cc:1217
static bool EmitAtomNonLetter(Zone *zone, RegExpCompiler *compiler, uint16_t c, BlockLabel *on_failure, intptr_t cp_offset, bool check, bool preloaded)
Definition: regexp.cc:902
static int8_t data[kExtLength]
ContainedInLattice
Definition: regexp.h:1080
@ kLatticeOut
Definition: regexp.h:1083
@ kLatticeUnknown
Definition: regexp.h:1084
@ kLatticeIn
Definition: regexp.h:1082
static constexpr intptr_t kLineTerminatorRangeCount
Definition: regexp.cc:2785
RegExpNode * UnanchoredAdvance(RegExpCompiler *compiler, RegExpNode *on_success)
Definition: regexp.cc:4121
static constexpr int32_t kDigitRanges[]
Definition: regexp.cc:2779
static void MoveRanges(ZoneGrowableArray< CharacterRange > *list, intptr_t from, intptr_t to, intptr_t count)
Definition: regexp.cc:4795
static void EmitUseLookupTable(RegExpMacroAssembler *masm, ZoneGrowableArray< uint16_t > *ranges, intptr_t start_index, intptr_t end_index, uint16_t min_char, BlockLabel *fall_through, BlockLabel *even_label, BlockLabel *odd_label)
Definition: regexp.cc:1064
static void EmitBoundaryTest(RegExpMacroAssembler *masm, uint16_t border, BlockLabel *fall_through, BlockLabel *above_or_equal, BlockLabel *below)
Definition: regexp.cc:1027
static RegExpEngine::CompilationResult IrregexpRegExpTooBig()
Definition: regexp.cc:384
static bool RangesContainLatin1Equivalents(ZoneGrowableArray< CharacterRange > *ranges)
Definition: regexp.cc:1953
COMPILE_ASSERT(kUnreachableReference==WeakTable::kNoValue)
static bool EmitSimpleCharacter(Zone *zone, RegExpCompiler *compiler, uint16_t c, BlockLabel *on_failure, intptr_t cp_offset, bool check, bool preloaded)
Definition: regexp.cc:883
static bool DeterminedAlready(QuickCheckDetails *quick_check, intptr_t offset)
Definition: regexp.cc:2351
static bool CompareRanges(ZoneGrowableArray< CharacterRange > *ranges, const int32_t *special_class, intptr_t length)
Definition: regexp.cc:3843
constexpr SkISize kSize
Definition: mock_canvas.cc:17
DEF_SWITCHES_START aot vmservice shared library Name of the *so containing AOT compiled Dart assets for launching the service isolate vm snapshot The VM snapshot data that will be memory mapped as read only SnapshotAssetPath must be present isolate snapshot The isolate snapshot data that will be memory mapped as read only SnapshotAssetPath must be present cache dir Path to the cache directory This is different from the persistent_cache_path in embedder which is used for Skia shader cache icu native lib Path to the library file that exports the ICU data vm service The hostname IP address on which the Dart VM Service should be served If not defaults to or::depending on whether ipv6 is specified vm service A custom Dart VM Service port The default is to pick a randomly available open port disable vm Disable the Dart VM Service The Dart VM Service is never available in release mode disable vm service Disable mDNS Dart VM Service publication Bind to the IPv6 localhost address for the Dart VM Service Ignored if vm service host is set endless trace buffer
Definition: switches.h:126
it will be possible to load the file into Perfetto s trace viewer disable asset Prevents usage of any non test fonts unless they were explicitly Loaded via prefetched default font Indicates whether the embedding started a prefetch of the default font manager before creating the engine run In non interactive mode
Definition: switches.h:228
DEF_SWITCHES_START aot vmservice shared library Name of the *so containing AOT compiled Dart assets for launching the service isolate vm snapshot The VM snapshot data that will be memory mapped as read only SnapshotAssetPath must be present isolate snapshot The isolate snapshot data that will be memory mapped as read only SnapshotAssetPath must be present cache dir Path to the cache directory This is different from the persistent_cache_path in embedder which is used for Skia shader cache icu native lib Path to the library file that exports the ICU data vm service The hostname IP address on which the Dart VM Service should be served If not set
Definition: switches.h:76
compiler
Definition: malisc.py:17
SI auto map(std::index_sequence< I... >, Fn &&fn, const Args &... args) -> skvx::Vec< sizeof...(I), decltype(fn(args[0]...))>
Definition: SkVx.h:680
#define FALL_THROUGH
Definition: globals.h:15
#define Pd
Definition: globals.h:408
#define DISALLOW_ALLOCATION()
Definition: globals.h:604
#define DISALLOW_COPY_AND_ASSIGN(TypeName)
Definition: globals.h:581
#define FOR_EACH_NODE_TYPE(VISIT)
Definition: regexp.h:210
SeparatedVector2 offset
QuickCheckDetails quick_check_details
Definition: regexp.cc:2730
bool been_analyzed
Definition: regexp.h:324
bool being_analyzed
Definition: regexp.h:323
static constexpr intptr_t kEatsAtLeastNotYetInitialized
Definition: regexp.h:1374
intptr_t preload_characters_
Definition: regexp.h:1377
intptr_t eats_at_least_
Definition: regexp.h:1378
static constexpr intptr_t kMaxWidth
Definition: unibrow.h:56
#define ARRAY_SIZE(array)
Definition: globals.h:72
#define OZ
Definition: regexp.cc:3793
#define Z
Definition: regexp.cc:26
#define DEFINE_ACCEPT(Type)
Definition: regexp.cc:828
WORD ATOM
Definition: windows_types.h:43