Flutter Engine
The Flutter Engine
Public Types | Static Public Member Functions | Static Public Attributes | List of all members
dart::Utf8 Class Reference

#include <unicode.h>

Inheritance diagram for dart::Utf8:
dart::AllStatic

Public Types

enum  Type { kLatin1 = 0 , kBMP , kSupplementary }
 

Static Public Member Functions

static intptr_t CodeUnitCount (const uint8_t *utf8_array, intptr_t array_len, Type *type)
 
static bool IsValid (const uint8_t *utf8_array, intptr_t array_len)
 
static intptr_t Length (int32_t ch)
 
static intptr_t Length (const String &str)
 
static intptr_t Encode (int32_t ch, char *dst)
 
static intptr_t Encode (const String &src, char *dst, intptr_t len)
 
static intptr_t Decode (const uint8_t *utf8_array, intptr_t array_len, int32_t *ch)
 
static bool DecodeToLatin1 (const uint8_t *utf8_array, intptr_t array_len, uint8_t *dst, intptr_t len)
 
static bool DecodeToUTF16 (const uint8_t *utf8_array, intptr_t array_len, uint16_t *dst, intptr_t len)
 
static bool DecodeToUTF32 (const uint8_t *utf8_array, intptr_t array_len, int32_t *dst, intptr_t len)
 
static intptr_t ReportInvalidByte (const uint8_t *utf8_array, intptr_t array_len, intptr_t len)
 
static bool DecodeCStringToUTF32 (const char *str, int32_t *dst, intptr_t len)
 

Static Public Attributes

static constexpr int32_t kMaxOneByteChar = 0x7F
 
static constexpr int32_t kMaxTwoByteChar = 0x7FF
 
static constexpr int32_t kMaxThreeByteChar = 0xFFFF
 
static constexpr int32_t kMaxFourByteChar = Utf::kMaxCodePoint
 

Detailed Description

Definition at line 41 of file unicode.h.

Member Enumeration Documentation

◆ Type

Enumerator
kLatin1 
kBMP 
kSupplementary 

Definition at line 43 of file unicode.h.

43 {
44 kLatin1 = 0, // Latin-1 code point [U+0000, U+00FF].
45 kBMP, // Basic Multilingual Plane code point [U+0000, U+FFFF].
46 kSupplementary, // Supplementary code point [U+010000, U+10FFFF].
47 };
@ kSupplementary
Definition: unicode.h:46
@ kLatin1
Definition: unicode.h:44

Member Function Documentation

◆ CodeUnitCount()

intptr_t dart::Utf8::CodeUnitCount ( const uint8_t *  utf8_array,
intptr_t  array_len,
Type *  type 
)
static

Definition at line 46 of file unicode.cc.

48 {
49 intptr_t len = 0;
50 Type char_type = kLatin1;
51 for (intptr_t i = 0; i < array_len; i++) {
52 uint8_t code_unit = utf8_array[i];
53 if (!IsTrailByte(code_unit)) {
54 ++len;
55 if (!IsLatin1SequenceStart(code_unit)) { // > U+00FF
56 if (IsSupplementarySequenceStart(code_unit)) { // >= U+10000
57 char_type = kSupplementary;
58 ++len;
59 } else if (char_type == kLatin1) {
60 char_type = kBMP;
61 }
62 }
63 }
64 }
65 *type = char_type;
66 return len;
67}
GLenum type

◆ Decode()

intptr_t dart::Utf8::Decode ( const uint8_t *  utf8_array,
intptr_t  array_len,
int32_t *  ch 
)
static

Definition at line 135 of file unicode.cc.

137 {
138 uint32_t ch = utf8_array[0] & 0xFF;
139 intptr_t i = 1;
140 if (ch >= 0x80) {
141 intptr_t num_trail_bytes = kTrailBytes[ch];
142 bool is_malformed = false;
143 for (; i < num_trail_bytes; ++i) {
144 if (i < array_len) {
145 uint8_t code_unit = utf8_array[i];
146 is_malformed |= !IsTrailByte(code_unit);
147 ch = (ch << 6) + code_unit;
148 } else {
149 *dst = -1;
150 return 0;
151 }
152 }
153 ch -= kMagicBits[num_trail_bytes];
154 if (!((is_malformed == false) && (i == num_trail_bytes) &&
155 !Utf::IsOutOfRange(ch) && !IsNonShortestForm(ch, i))) {
156 *dst = -1;
157 return 0;
158 }
159 }
160 *dst = ch;
161 return i;
162}
static bool IsOutOfRange(int32_t code_point)
Definition: unicode.h:36
dst
Definition: cp.py:12

◆ DecodeCStringToUTF32()

bool dart::Utf8::DecodeCStringToUTF32 ( const char *  str,
int32_t *  dst,
intptr_t  len 
)
static

Definition at line 266 of file unicode.cc.

266 {
267 ASSERT(str != nullptr);
268 intptr_t array_len = strlen(str);
269 const uint8_t* utf8_array = reinterpret_cast<const uint8_t*>(str);
270 return Utf8::DecodeToUTF32(utf8_array, array_len, dst, len);
271}
static bool DecodeToUTF32(const uint8_t *utf8_array, intptr_t array_len, int32_t *dst, intptr_t len)
Definition: unicode.cc:245
#define ASSERT(E)

◆ DecodeToLatin1()

bool dart::Utf8::DecodeToLatin1 ( const uint8_t *  utf8_array,
intptr_t  array_len,
uint8_t *  dst,
intptr_t  len 
)
static

Definition at line 194 of file unicode.cc.

197 {
198 intptr_t i = 0;
199 intptr_t j = 0;
200 intptr_t num_bytes;
201 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
202 int32_t ch;
203 ASSERT(IsLatin1SequenceStart(utf8_array[i]));
204 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
205 if (ch == -1) {
206 return false; // Invalid input.
207 }
208 ASSERT(Utf::IsLatin1(ch));
209 dst[j] = ch;
210 }
211 if ((i < array_len) && (j == len)) {
212 return false; // Output overflow.
213 }
214 return true; // Success.
215}
static intptr_t Decode(const uint8_t *utf8_array, intptr_t array_len, int32_t *ch)
Definition: unicode.cc:135
static bool IsLatin1(int32_t code_point)
Definition: unicode.h:23

◆ DecodeToUTF16()

bool dart::Utf8::DecodeToUTF16 ( const uint8_t *  utf8_array,
intptr_t  array_len,
uint16_t *  dst,
intptr_t  len 
)
static

Definition at line 217 of file unicode.cc.

220 {
221 intptr_t i = 0;
222 intptr_t j = 0;
223 intptr_t num_bytes;
224 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
225 int32_t ch;
226 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]);
227 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
228 if (ch == -1) {
229 return false; // Invalid input.
230 }
231 if (is_supplementary) {
232 if (j == (len - 1)) return false; // Output overflow.
233 Utf16::Encode(ch, &dst[j]);
234 j = j + 1;
235 } else {
236 dst[j] = ch;
237 }
238 }
239 if ((i < array_len) && (j == len)) {
240 return false; // Output overflow.
241 }
242 return true; // Success.
243}
static void Encode(int32_t codepoint, uint16_t *dst)
Definition: unicode.cc:273

◆ DecodeToUTF32()

bool dart::Utf8::DecodeToUTF32 ( const uint8_t *  utf8_array,
intptr_t  array_len,
int32_t *  dst,
intptr_t  len 
)
static

Definition at line 245 of file unicode.cc.

248 {
249 intptr_t i = 0;
250 intptr_t j = 0;
251 intptr_t num_bytes;
252 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
253 int32_t ch;
254 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
255 if (ch == -1) {
256 return false; // Invalid input.
257 }
258 dst[j] = ch;
259 }
260 if ((i < array_len) && (j == len)) {
261 return false; // Output overflow.
262 }
263 return true; // Success.
264}

◆ Encode() [1/2]

intptr_t dart::Utf8::Encode ( const String &  src,
char *  dst,
intptr_t  len 
)
static

Definition at line 65 of file unicode.cc.

65 {
66 uintptr_t array_len = len;
67 intptr_t pos = 0;
68 ASSERT(static_cast<intptr_t>(array_len) >= Length(src));
69 if (src.IsOneByteString()) {
70 // For 1-byte strings, all code points < 0x80 have single-byte UTF-8
71 // encodings and all >= 0x80 have two-byte encodings.
72 NoSafepointScope scope;
73 const uintptr_t* data =
74 reinterpret_cast<const uintptr_t*>(OneByteString::DataStart(src));
75 uintptr_t char_length = src.Length();
76 uintptr_t pos = 0;
77 ASSERT(kMaxOneByteChar + 1 == 0x80);
78 for (uintptr_t i = 0; i < char_length; i += sizeof(uintptr_t)) {
79 // Read the input one word at a time and just write it verbatim if it is
80 // plain ASCII, as determined by the mask.
81 if (i + sizeof(uintptr_t) <= char_length &&
82 (*data & kAsciiWordMask) == 0 &&
83 pos + sizeof(uintptr_t) <= array_len) {
84 StoreUnaligned(reinterpret_cast<uintptr_t*>(dst + pos), *data);
85 pos += sizeof(uintptr_t);
86 } else {
87 // Process up to one word of input that contains non-ASCII Latin1
88 // characters.
89 const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
90 const uint8_t* limit =
91 Utils::Minimum(p + sizeof(uintptr_t), p + (char_length - i));
92 for (; p < limit; p++) {
93 uint8_t c = *p;
94 // These calls to Length and Encode get inlined and the cases for 3
95 // and 4 byte sequences are removed.
96 intptr_t bytes = Length(c);
97 if (pos + bytes > array_len) {
98 return pos;
99 }
100 Encode(c, reinterpret_cast<char*>(dst) + pos);
101 pos += bytes;
102 }
103 }
104 data++;
105 }
106 } else {
107 // For two-byte strings, which can contain 3 and 4-byte UTF-8 encodings,
108 // which can result in surrogate pairs, use the more general code.
109 String::CodePointIterator it(src);
110 while (it.Next()) {
111 int32_t ch = it.Current();
113 if (Utf16::IsSurrogate(ch)) {
114 // Encode unpaired surrogates as replacement characters to ensure the
115 // output is valid UTF-8. Encoded size is the same (3), so the computed
116 // length is still valid.
117 ch = Utf::kReplacementChar;
118 }
119 intptr_t num_bytes = Utf8::Length(ch);
120 if (pos + num_bytes > len) {
121 break;
122 }
123 Utf8::Encode(ch, &dst[pos]);
124 pos += num_bytes;
125 }
126 }
127 return pos;
128}
SkPoint pos
static bool IsSurrogate(uint32_t ch)
Definition: unicode.h:123
static intptr_t Length(int32_t ch)
Definition: unicode.cc:98
static intptr_t Encode(int32_t ch, char *dst)
Definition: unicode.cc:110
static constexpr int32_t kMaxOneByteChar
Definition: unicode.h:86
static constexpr int32_t kReplacementChar
Definition: unicode.h:21
static T Minimum(T x, T y)
Definition: utils.h:36
static void StoreUnaligned(T *ptr, T value)
Definition: unaligned.h:22
static int8_t data[kExtLength]
static constexpr uintptr_t kAsciiWordMask
Definition: unicode.cc:18

◆ Encode() [2/2]

intptr_t dart::Utf8::Encode ( int32_t  ch,
char *  dst 
)
static

Definition at line 110 of file unicode.cc.

110 {
111 constexpr int kMask = ~(1 << 6);
112 if (ch <= kMaxOneByteChar) {
113 dst[0] = ch;
114 return 1;
115 }
116 if (ch <= kMaxTwoByteChar) {
117 dst[0] = 0xC0 | (ch >> 6);
118 dst[1] = 0x80 | (ch & kMask);
119 return 2;
120 }
121 if (ch <= kMaxThreeByteChar) {
122 dst[0] = 0xE0 | (ch >> 12);
123 dst[1] = 0x80 | ((ch >> 6) & kMask);
124 dst[2] = 0x80 | (ch & kMask);
125 return 3;
126 }
127 ASSERT(ch <= kMaxFourByteChar);
128 dst[0] = 0xF0 | (ch >> 18);
129 dst[1] = 0x80 | ((ch >> 12) & kMask);
130 dst[2] = 0x80 | ((ch >> 6) & kMask);
131 dst[3] = 0x80 | (ch & kMask);
132 return 4;
133}
static constexpr int32_t kMaxTwoByteChar
Definition: unicode.h:87
static constexpr int32_t kMaxFourByteChar
Definition: unicode.h:89
static constexpr int32_t kMaxThreeByteChar
Definition: unicode.h:88
@ kMask
Definition: SkGlyph.h:315

◆ IsValid()

bool dart::Utf8::IsValid ( const uint8_t *  utf8_array,
intptr_t  array_len 
)
static

Definition at line 70 of file unicode.cc.

70 {
71 intptr_t i = 0;
72 while (i < array_len) {
73 uint32_t ch = utf8_array[i] & 0xFF;
74 intptr_t j = 1;
75 if (ch >= 0x80) {
76 int8_t num_trail_bytes = kTrailBytes[ch];
77 bool is_malformed = false;
78 for (; j < num_trail_bytes; ++j) {
79 if ((i + j) < array_len) {
80 uint8_t code_unit = utf8_array[i + j];
81 is_malformed |= !IsTrailByte(code_unit);
82 ch = (ch << 6) + code_unit;
83 } else {
84 return false;
85 }
86 }
87 ch -= kMagicBits[num_trail_bytes];
88 if (!((is_malformed == false) && (j == num_trail_bytes) &&
89 !Utf::IsOutOfRange(ch) && !IsNonShortestForm(ch, j))) {
90 return false;
91 }
92 }
93 i += j;
94 }
95 return true;
96}

◆ Length() [1/2]

intptr_t dart::Utf8::Length ( const String &  str)
static

Definition at line 21 of file unicode.cc.

21 {
22 if (str.IsOneByteString()) {
23 // For 1-byte strings, all code points < 0x80 have single-byte UTF-8
24 // encodings and all >= 0x80 have two-byte encodings. To get the length,
25 // start with the number of code points and add the number of high bits in
26 // the bytes.
27 uintptr_t char_length = str.Length();
28 uintptr_t length = char_length;
29 NoSafepointScope no_safepoint;
30 const uintptr_t* data =
31 reinterpret_cast<const uintptr_t*>(OneByteString::DataStart(str));
32 uintptr_t i;
33 for (i = sizeof(uintptr_t); i <= char_length; i += sizeof(uintptr_t)) {
34 uintptr_t chunk = *data++;
35 chunk &= kAsciiWordMask;
36 if (chunk != 0) {
37// Shuffle the bits until we have a count of bits in the low nibble.
38#if defined(ARCH_IS_64_BIT)
39 chunk += chunk >> 32;
40#endif
41 chunk += chunk >> 16;
42 chunk += chunk >> 8;
43 length += (chunk >> 7) & 0xf;
44 }
45 }
46 // Take care of the tail of the string, the last length % wordsize chars.
47 i -= sizeof(uintptr_t);
48 for (; i < char_length; i++) {
49 if (str.CharAt(i) > kMaxOneByteChar) length++;
50 }
51 return length;
52 }
53
54 // Slow case for 2-byte strings that handles surrogate pairs and longer UTF-8
55 // encodings.
56 intptr_t length = 0;
57 String::CodePointIterator it(str);
58 while (it.Next()) {
59 int32_t ch = it.Current();
60 length += Utf8::Length(ch);
61 }
62 return length;
63}
size_t length

◆ Length() [2/2]

intptr_t dart::Utf8::Length ( int32_t  ch)
static

Definition at line 98 of file unicode.cc.

98 {
99 if (ch <= kMaxOneByteChar) {
100 return 1;
101 } else if (ch <= kMaxTwoByteChar) {
102 return 2;
103 } else if (ch <= kMaxThreeByteChar) {
104 return 3;
105 }
107 return 4;
108}

◆ ReportInvalidByte()

intptr_t dart::Utf8::ReportInvalidByte ( const uint8_t *  utf8_array,
intptr_t  array_len,
intptr_t  len 
)
static

Definition at line 163 of file unicode.cc.

165 {
166 intptr_t i = 0;
167 intptr_t j = 0;
168 intptr_t num_bytes;
169 for (; (i < array_len) && (j < len); i += num_bytes, ++j) {
170 int32_t ch;
171 bool is_supplementary = IsSupplementarySequenceStart(utf8_array[i]);
172 num_bytes = Utf8::Decode(&utf8_array[i], (array_len - i), &ch);
173 if (ch == -1) {
174 break; // Invalid input.
175 }
176 if (is_supplementary) {
177 j = j + 1;
178 }
179 }
180#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
181 // Remain silent while libFuzzer is active, since
182 // the output only slows down the in-process fuzzing.
183#else
184 Syslog::PrintErr("Invalid UTF8 sequence encountered, ");
185 for (intptr_t idx = 0; idx < 10 && (i + idx) < array_len; idx++) {
186 Syslog::PrintErr("(Error Code: %X + idx: %" Pd " )", utf8_array[idx + i],
187 (idx + i));
188 }
189 Syslog::PrintErr("\n");
190#endif
191 return i;
192}
static void PrintErr(const char *format,...) PRINTF_ATTRIBUTE(1
#define Pd
Definition: globals.h:408

Member Data Documentation

◆ kMaxFourByteChar

constexpr int32_t dart::Utf8::kMaxFourByteChar = Utf::kMaxCodePoint
static constexpr

Definition at line 89 of file unicode.h.

◆ kMaxOneByteChar

constexpr int32_t dart::Utf8::kMaxOneByteChar = 0x7F
static constexpr

Definition at line 86 of file unicode.h.

◆ kMaxThreeByteChar

constexpr int32_t dart::Utf8::kMaxThreeByteChar = 0xFFFF
static constexpr

Definition at line 88 of file unicode.h.

◆ kMaxTwoByteChar

constexpr int32_t dart::Utf8::kMaxTwoByteChar = 0x7FF
static constexpr

Definition at line 87 of file unicode.h.


The documentation for this class was generated from the following files: