// Lookup table with one entry per possible byte value (256 entries).
// Reading the rows below:
//   0x00-0x7F -> 1    0x80-0xBF -> 0    0xC0-0xDF -> 2    0xE0-0xEF -> 3
//   0xF0-0xF7 -> 4    0xF8-0xFB -> 5    0xFC-0xFD -> 6    0xFE-0xFF -> 0
// The decode loops in this file read it as `num_trail_bytes = kTrailBytes[ch]`
// and use the value both as a loop bound and as the index into kMagicBits.
// NOTE(review): despite the name, the values look like total sequence lengths
// (lead byte included) rather than counts of trailing bytes -- confirm.
// NOTE(review): the closing `};` of this initializer is not part of this
// excerpt.
14const int8_t Utf8::kTrailBytes[256] = {
15 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
16 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
17 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
22 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
23 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
24 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
25 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
26 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
27 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
28 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
29 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
30 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
// Correction constants indexed by the value read from kTrailBytes (indices 0
// and 1 are both zero). The decoders in this file fold raw bytes with
// `ch = (ch << 6) + code_unit` and afterwards execute
// `ch -= kMagicBits[num_trail_bytes]`. Each non-zero constant equals the
// lead-byte marker plus the 0x80 continuation markers shifted to where that
// folding leaves them, for example:
//   0x00003080 == (0xC0 << 6) + 0x80
//   0x000E2080 == (0xE0 << 12) + (0x80 << 6) + 0x80
//   0x03C82080 == (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80
// The final entry only matches this pattern modulo 2^32.
34const uint32_t Utf8::kMagicBits[7] = {0,
35 0x00000000, 0x00003080, 0x000E2080,
36 0x03C82080, 0xFA082080, 0x82082080};
// Seven-entry table; only part of its initializer is visible in this excerpt
// (the embedded listing numbers jump from 39 to 41, so a row appears to be
// missing between the two lines below).
// NOTE(review): presumably the smallest code point that legitimately needs a
// sequence of each length, used to reject overlong encodings -- the use site
// is not visible here; confirm against the full file.
39const uint32_t Utf8::kOverlongMinimum[7] = {0,
41 0x10000, 0xFFFFFFFF, 0xFFFFFFFF};
// Fragment of a forward scan over utf8_array[0 .. array_len). Only the
// control-flow skeleton survives in this excerpt; the statements inside each
// branch are not shown.
// The classification below only runs for bytes where IsTrailByte() is false.
// Such bytes that are not a Latin-1 sequence start are split further into
// supplementary sequence starts and, otherwise, the case where
// char_type == kLatin1.
// NOTE(review): this looks like the body of Utf8::CodeUnitCount -- confirm.
51 for (intptr_t
i = 0;
i < array_len;
i++) {
52 uint8_t code_unit = utf8_array[
i];
53 if (!IsTrailByte(code_unit)) {
55 if (!IsLatin1SequenceStart(code_unit)) {
56 if (IsSupplementarySequenceStart(code_unit)) {
59 }
else if (char_type ==
kLatin1) {
// Fragment of a loop that walks utf8_array one encoded sequence at a time.
// Several original lines are missing from this excerpt, including the
// declaration of j and the tail of the final condition.
// For the sequence starting at index i: read the lead byte, look up the loop
// bound in kTrailBytes, fold the following bytes into ch six bits at a time
// while recording in is_malformed whether any of them fails IsTrailByte(),
// then subtract kMagicBits[num_trail_bytes] from the folded value.
// NOTE(review): probably Utf8::IsValid -- confirm against the full file.
72 while (
i < array_len) {
73 uint32_t ch = utf8_array[
i] & 0xFF;
76 int8_t num_trail_bytes = kTrailBytes[ch];
77 bool is_malformed =
false;
78 for (; j < num_trail_bytes; ++j) {
// Only read utf8_array[i + j] when that index is still inside the input.
79 if ((
i + j) < array_len) {
80 uint8_t code_unit = utf8_array[
i + j];
81 is_malformed |= !IsTrailByte(code_unit);
82 ch = (ch << 6) + code_unit;
// Remove the marker bits that the folding above carried along.
87 ch -= kMagicBits[num_trail_bytes];
// The visible part of the acceptance test requires no malformed byte and
// that the loop consumed every expected byte (j == num_trail_bytes).
88 if (!((is_malformed ==
false) && (j == num_trail_bytes) &&
// Fragment of the UTF-8 encoder. The range tests and return statements that
// separate the three groups below are not part of this excerpt. The groups
// write 2-, 3- and 4-byte sequences with lead markers 0xC0, 0xE0 and 0xF0
// followed by continuation bytes tagged with 0x80.
// kMask is ~(1 << 6), i.e. it clears only bit 6. Bits above bit 7 survive the
// AND, and bit 7 is set by the `0x80 |` regardless of ch.
// NOTE(review): this presumably relies on dst having a byte-sized element
// type so that everything above the low 8 bits is dropped on store -- confirm.
111 constexpr int kMask = ~(1 << 6);
117 dst[0] = 0xC0 | (ch >> 6);
118 dst[1] = 0x80 | (ch & kMask);
122 dst[0] = 0xE0 | (ch >> 12);
123 dst[1] = 0x80 | ((ch >> 6) & kMask);
124 dst[2] = 0x80 | (ch & kMask);
128 dst[0] = 0xF0 | (ch >> 18);
129 dst[1] = 0x80 | ((ch >> 12) & kMask);
130 dst[2] = 0x80 | ((ch >> 6) & kMask);
131 dst[3] = 0x80 | (ch & kMask);
// Fragment of a decoder for the single sequence that starts at utf8_array[0].
// The lead byte selects the loop bound via kTrailBytes; each further byte is
// folded into ch six bits at a time while is_malformed records whether any
// of them fails IsTrailByte(). kMagicBits[num_trail_bytes] is then subtracted
// from the folded value.
// NOTE(review): the embedded listing numbers skip from 143 to 145, i.e. one
// line is missing right before the read of utf8_array[i]; check whether a
// bounds check against array_len lives there. The declaration of i and the
// tail of the final condition are also outside this excerpt.
// NOTE(review): probably Utf8::Decode -- confirm against the full file.
138 uint32_t ch = utf8_array[0] & 0xFF;
141 intptr_t num_trail_bytes = kTrailBytes[ch];
142 bool is_malformed =
false;
143 for (;
i < num_trail_bytes; ++
i) {
145 uint8_t code_unit = utf8_array[
i];
146 is_malformed |= !IsTrailByte(code_unit);
147 ch = (ch << 6) + code_unit;
153 ch -= kMagicBits[num_trail_bytes];
// The visible part of the acceptance test requires no malformed byte and
// that the loop ran to completion (i == num_trail_bytes).
154 if (!((is_malformed ==
false) && (
i == num_trail_bytes) &&
// Fragment: walks the input with two counters. i indexes utf8_array and is
// advanced by num_bytes per iteration; j is incremented once per iteration
// and bounded by len. is_supplementary records whether the byte at i passes
// IsSupplementarySequenceStart(); what is done with it is not visible here.
// A later loop visits at most 10 bytes starting at index i and stops at
// array_len.
// NOTE(review): the code guarded by FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
// and the body of the 10-byte loop are outside this excerpt. This is probably
// Utf8::ReportInvalidByte -- confirm.
169 for (; (
i < array_len) && (j <
len);
i += num_bytes, ++j) {
171 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[
i]);
176 if (is_supplementary) {
180#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
185 for (intptr_t idx = 0; idx < 10 && (
i + idx) < array_len; idx++) {
// Fragment: iterates while input remains (i < array_len) and j is below len,
// advancing i by num_bytes and j by one per iteration. Every sequence start
// is asserted to satisfy IsLatin1SequenceStart().
// The check after the loop is true when j has reached len while input bytes
// remain; the action taken in that case is not visible in this excerpt.
// NOTE(review): probably Utf8::DecodeToLatin1 -- confirm.
201 for (; (
i < array_len) && (j <
len);
i += num_bytes, ++j) {
203 ASSERT(IsLatin1SequenceStart(utf8_array[
i]));
211 if ((
i < array_len) && (j ==
len)) {
// Fragment: iterates while input remains (i < array_len) and j is below len,
// advancing i by num_bytes and j by one per iteration.
// When the current sequence starts with a supplementary lead byte and j is
// already the last permitted index (len - 1), the function returns false.
// NOTE(review): presumably because such a code point needs two output code
// units -- the store into dst is not visible here; confirm.
// The check after the loop is true when j has reached len while input bytes
// remain; the action taken in that case is not visible in this excerpt.
// NOTE(review): probably Utf8::DecodeToUTF16 -- confirm.
224 for (; (
i < array_len) && (j <
len);
i += num_bytes, ++j) {
226 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[
i]);
231 if (is_supplementary) {
232 if (j == (
len - 1))
return false;
239 if ((
i < array_len) && (j ==
len)) {
// Fragment: iterates while input remains (i < array_len) and j is below len,
// advancing i by num_bytes and j by one per iteration. The loop body is not
// part of this excerpt.
// The check after the loop is true when j has reached len while input bytes
// remain; the action taken in that case is not visible here.
// NOTE(review): probably Utf8::DecodeToUTF32 -- confirm.
252 for (; (
i < array_len) && (j <
len);
i += num_bytes, ++j) {
260 if ((
i < array_len) && (j ==
len)) {
// Fragment: measures the NUL-terminated input str with strlen() and views
// the same memory as unsigned bytes. No copy is made; utf8_array aliases str.
// NOTE(review): probably the start of Utf8::DecodeCStringToUTF32, which
// presumably forwards to the array-based decoder -- the call is not visible
// in this excerpt.
268 intptr_t array_len = strlen(str);
269 const uint8_t* utf8_array =
reinterpret_cast<const uint8_t*
>(str);
// Fragment: writes two UTF-16 code units into dst.
// dst[0] is Utf16::kLeadSurrogateOffset plus codepoint shifted right by ten;
// dst[1] is 0xDC00 plus the low ten bits of codepoint.
// NOTE(review): the value of kLeadSurrogateOffset is not visible here;
// presumably it already accounts for the 0x10000 bias of supplementary code
// points -- confirm.
276 dst[0] = (Utf16::kLeadSurrogateOffset + (codepoint >> 10));
277 dst[1] = (0xDC00 + (codepoint & 0x3FF));
static void PrintErr(const char *format,...) PRINTF_ATTRIBUTE(1
static constexpr int32_t kMaxCodeUnit
static void Encode(int32_t codepoint, uint16_t *dst)
static bool DecodeCStringToUTF32(const char *str, int32_t *dst, intptr_t len)
static intptr_t Length(int32_t ch)
static bool DecodeToUTF32(const uint8_t *utf8_array, intptr_t array_len, int32_t *dst, intptr_t len)
static constexpr int32_t kMaxTwoByteChar
static constexpr int32_t kMaxFourByteChar
static intptr_t CodeUnitCount(const uint8_t *utf8_array, intptr_t array_len, Type *type)
static intptr_t ReportInvalidByte(const uint8_t *utf8_array, intptr_t array_len, intptr_t len)
static constexpr int32_t kMaxThreeByteChar
static bool IsValid(const uint8_t *utf8_array, intptr_t array_len)
static bool DecodeToUTF16(const uint8_t *utf8_array, intptr_t array_len, uint16_t *dst, intptr_t len)
static intptr_t Decode(const uint8_t *utf8_array, intptr_t array_len, int32_t *ch)
static bool DecodeToLatin1(const uint8_t *utf8_array, intptr_t array_len, uint8_t *dst, intptr_t len)
static intptr_t Encode(int32_t ch, char *dst)
static constexpr int32_t kMaxOneByteChar
static bool IsOutOfRange(int32_t code_point)
static bool IsLatin1(int32_t code_point)