d2/d89/vm_2unicode_8cc_source.html

// Copyright (c) 2012, the Dart project authors.  Please see the AUTHORS file

// for details. All rights reserved. Use of this source code is governed by a

// BSD-style license that can be found in the LICENSE file.


#include "platform/unicode.h"


#include "vm/allocation.h"

#include "vm/globals.h"

#include "vm/object.h"


namespace dart {


// Word-sized mask that selects the top bit (0x80) of every byte.  'And'ing a
// word of 1-byte string data with it yields zero exactly when all of the bytes
// are ASCII; any bit that survives marks a non-ASCII Latin1 character.
#if !defined(ARCH_IS_64_BIT)
static constexpr uintptr_t kAsciiWordMask = 0x80808080u;
#else
static constexpr uintptr_t kAsciiWordMask = DART_UINT64_C(0x8080808080808080);
#endif


intptr_t Utf8::Length(const String& str) {

  if (str.IsOneByteString()) {

    // For 1-byte strings, all code points < 0x80 have single-byte UTF-8

    // encodings and all >= 0x80 have two-byte encodings.  To get the length,

    // start with the number of code points and add the number of high bits in

    // the bytes.

    uintptr_t char_length = str.Length();

    uintptr_t length = char_length;

    NoSafepointScope no_safepoint;

    const uintptr_t* data =

        reinterpret_cast<const uintptr_t*>(OneByteString::DataStart(str));

    uintptr_t i;

    for (i = sizeof(uintptr_t); i <= char_length; i += sizeof(uintptr_t)) {

      uintptr_t chunk = *data++;

      chunk &= kAsciiWordMask;

      if (chunk != 0) {

// Shuffle the bits until we have a count of bits in the low nibble.

#if defined(ARCH_IS_64_BIT)

        chunk += chunk >> 32;

#endif

        chunk += chunk >> 16;

        chunk += chunk >> 8;

        length += (chunk >> 7) & 0xf;

      }

    }

    // Take care of the tail of the string, the last length % wordsize chars.

    i -= sizeof(uintptr_t);

    for (; i < char_length; i++) {

      if (str.CharAt(i) > kMaxOneByteChar) length++;

    }

    return length;

  }


  // Slow case for 2-byte strings that handles surrogate pairs and longer UTF-8

  // encodings.

  intptr_t length = 0;

  String::CodePointIterator it(str);

  while (it.Next()) {

    int32_t ch = it.Current();

    length += Utf8::Length(ch);

  }

  return length;

}


// Encodes |src| as UTF-8 into |dst|, which has room for |len| bytes, and
// returns the number of bytes written.
//
// If the output buffer is too small, encoding stops before the first character
// whose encoding would not fit completely, and the number of bytes written so
// far is returned.  (Debug builds assert that |len| is at least Length(src).)
intptr_t Utf8::Encode(const String& src, char* dst, intptr_t len) {
  const uintptr_t array_len = len;
  ASSERT(static_cast<intptr_t>(array_len) >= Length(src));
  if (src.IsOneByteString()) {
    // For 1-byte strings, all code points < 0x80 have single-byte UTF-8
    // encodings and all >= 0x80 have two-byte encodings.
    NoSafepointScope scope;
    const uintptr_t* data =
        reinterpret_cast<const uintptr_t*>(OneByteString::DataStart(src));
    const uintptr_t char_length = src.Length();
    // Output position for this branch.  It is unsigned so that it compares
    // directly against array_len.  This branch returns it itself: an earlier
    // version shadowed a function-level counter here and then fell through to
    // a shared return of that (never updated) outer counter, reporting 0 bytes
    // written for every fully encoded 1-byte string.
    uintptr_t pos = 0;
    ASSERT(kMaxOneByteChar + 1 == 0x80);
    for (uintptr_t i = 0; i < char_length; i += sizeof(uintptr_t)) {
      // Read the input one word at a time and just write it verbatim if it is
      // plain ASCII, as determined by the mask.  The word is only read when a
      // full word of input remains, and only stored when a full word of output
      // space remains.
      if (i + sizeof(uintptr_t) <= char_length &&
          (*data & kAsciiWordMask) == 0 &&
          pos + sizeof(uintptr_t) <= array_len) {
        StoreUnaligned(reinterpret_cast<uintptr_t*>(dst + pos), *data);
        pos += sizeof(uintptr_t);
      } else {
        // Process up to one word of input byte by byte: it contains non-ASCII
        // Latin1 characters, is the partial word at the end of the string, or
        // does not fit into the output as a whole word.
        const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
        const uint8_t* limit =
            Utils::Minimum(p + sizeof(uintptr_t), p + (char_length - i));
        for (; p < limit; p++) {
          uint8_t c = *p;
          // These calls to Length and Encode get inlined and the cases for 3
          // and 4 byte sequences are removed.
          intptr_t bytes = Length(c);
          if (pos + bytes > array_len) {
            // Out of space: report what has been written so far.
            return static_cast<intptr_t>(pos);
          }
          Encode(c, reinterpret_cast<char*>(dst) + pos);
          pos += bytes;
        }
      }
      data++;
    }
    return static_cast<intptr_t>(pos);
  }

  // For two-byte strings, which can contain 3 and 4-byte UTF-8 encodings,
  // which can result in surrogate pairs, use the more general code.
  intptr_t pos = 0;
  String::CodePointIterator it(src);
  while (it.Next()) {
    int32_t ch = it.Current();
    ASSERT(!Utf::IsOutOfRange(ch));
    if (Utf16::IsSurrogate(ch)) {
      // Encode unpaired surrogates as replacement characters to ensure the
      // output is valid UTF-8. Encoded size is the same (3), so the computed
      // length is still valid.
      ch = Utf::kReplacementChar;
    }
    intptr_t num_bytes = Utf8::Length(ch);
    if (pos + num_bytes > len) {
      // Out of space: stop before a partially written code point.
      break;
    }
    Utf8::Encode(ch, &dst[pos]);
    pos += num_bytes;
  }
  return pos;
}


}  // namespace dart

pos
SkPoint pos
Definition: ImageShaderTest.cpp:27

dart::NoSafepointScope
Definition: thread.h:1557

dart::String::CodePointIterator
Definition: object.h:10181

dart::String::CodePointIterator::Current
int32_t Current() const
Definition: object.h:10194

dart::String::CodePointIterator::Next
bool Next()
Definition: object.cc:24318

dart::String
Definition: object.h:10158

dart::String::IsOneByteString
bool IsOneByteString() const
Definition: object.h:10311

dart::String::Length
intptr_t Length() const
Definition: object.h:10210

dart::String::CharAt
uint16_t CharAt(intptr_t index) const
Definition: object.h:10259

dart::Utf16::IsSurrogate
static bool IsSurrogate(uint32_t ch)
Definition: unicode.h:123

dart::Utf8::Length
static intptr_t Length(int32_t ch)
Definition: unicode.cc:98

dart::Utf8::Encode
static intptr_t Encode(int32_t ch, char *dst)
Definition: unicode.cc:110

dart::Utf8::kMaxOneByteChar
static constexpr int32_t kMaxOneByteChar
Definition: unicode.h:86

dart::Utf::kReplacementChar
static constexpr int32_t kReplacementChar
Definition: unicode.h:21

dart::Utf::IsOutOfRange
static bool IsOutOfRange(int32_t code_point)
Definition: unicode.h:36

dart::Utils::Minimum
static T Minimum(T x, T y)
Definition: utils.h:36

ASSERT
#define ASSERT(E)
Definition: entrypoints_verification_test.cc:25

i
int i
Definition: fl_socket_accessible.cc:18

length
size_t length
Definition: key_event_handler.cc:41

dart_profiler_symbols.p
p
Definition: dart_profiler_symbols.py:55

dart
Definition: dart_vm.cc:33

dart::StoreUnaligned
static void StoreUnaligned(T *ptr, T value)
Definition: unaligned.h:22

dart::data
static int8_t data[kExtLength]
Definition: dart_api_impl_test.cc:2256

dart::kAsciiWordMask
static constexpr uintptr_t kAsciiWordMask
Definition: unicode.cc:18

gn.cp.dst
dst
Definition: cp.py:12

gn.find_headers.len
len
Definition: find_headers.py:30

mskp_parser.src
src
Definition: mskp_parser.py:22

object.h

DART_UINT64_C
#define DART_UINT64_C(x)
Definition: globals.h:434

allocation.h

unicode.h

globals.h