Flutter Engine
Hyphenator.h
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /**
18  * An implementation of Liang's hyphenation algorithm.
19  */
20 
21 #ifndef U_USING_ICU_NAMESPACE
22 #define U_USING_ICU_NAMESPACE 0
23 #endif // U_USING_ICU_NAMESPACE
24 
25 #include <memory>
26 #include <unordered_map>
27 #include <vector>
28 #include "unicode/locid.h"
29 
30 #ifndef MINIKIN_HYPHENATOR_H
31 #define MINIKIN_HYPHENATOR_H
32 
33 namespace minikin {
34 
// Describes what, if anything, happens at a potential hyphenation point.
//
// NOTE(review): in the listing this file was recovered from, every enumerator
// line except DONT_BREAK was missing (only the describing comments survived).
// The names and values below were restored to match those comments and the
// identifiers referenced elsewhere in this header; confirm them against the
// upstream minikin sources before relying on the numeric values.
enum class HyphenationType : uint8_t {
  // Note: There are implicit assumptions scattered in the code that DONT_BREAK
  // is 0.

  // Do not break.
  DONT_BREAK = 0,
  // Break the line and insert a normal hyphen.
  BREAK_AND_INSERT_HYPHEN = 1,
  // Break the line and insert an Armenian hyphen (U+058A).
  BREAK_AND_INSERT_ARMENIAN_HYPHEN = 2,
  // Break the line and insert a maqaf (Hebrew hyphen, U+05BE).
  BREAK_AND_INSERT_MAQAF = 3,
  // Break the line and insert a Canadian Syllabics hyphen (U+1400).
  BREAK_AND_INSERT_UCAS_HYPHEN = 4,
  // Break the line, but don't insert a hyphen. Used for cases when there is
  // already a hyphen present or the script does not use a hyphen (e.g. in
  // Malayalam).
  BREAK_AND_DONT_INSERT_HYPHEN = 5,
  // Break and replace the last code unit with hyphen. Used for Catalan "l·l"
  // which hyphenates as "l-/l".
  BREAK_AND_REPLACE_WITH_HYPHEN = 6,
  // Break the line, and repeat the hyphen (which is the last character) at the
  // beginning of the next line. Used in Polish, where "czerwono-niebieska"
  // should hyphenate as "czerwono-/-niebieska".
  BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE = 7,
  // Break the line, insert a ZWJ and hyphen at the first line, and a ZWJ at
  // the second line. This is used in Arabic script, mostly for writing systems
  // of Central Asia. It's our default behavior when a soft hyphen is used in
  // Arabic script.
  BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8
};
69 
// Opaque redeclaration so that the HyphenEdit signatures below do not depend
// on where the enumerator list is defined.
enum class HyphenationType : uint8_t;

// The hyphen edit represents an edit to the string when a word is
// hyphenated. The most common hyphen edit is adding a "-" at the end
// of a syllable, but nonstandard hyphenation allows for more choices.
// Note that a HyphenEdit can hold two types of edits at the same time,
// one at the beginning of the string/line and one at the end.
//
// Encoding: the end-of-line edit occupies the low three bits
// (MASK_END_OF_LINE) and the start-of-line edit occupies the next two bits
// (MASK_START_OF_LINE).
class HyphenEdit {
 public:
  // The constants are constexpr (rather than plain `static const`) so that
  // they are usable in constant expressions and, under C++17, are implicitly
  // inline and therefore safe to ODR-use without a separate definition.
  static constexpr uint32_t NO_EDIT = 0x00;

  // Edits applied at the end of the line being broken.
  static constexpr uint32_t INSERT_HYPHEN_AT_END = 0x01;
  static constexpr uint32_t INSERT_ARMENIAN_HYPHEN_AT_END = 0x02;
  static constexpr uint32_t INSERT_MAQAF_AT_END = 0x03;
  static constexpr uint32_t INSERT_UCAS_HYPHEN_AT_END = 0x04;
  static constexpr uint32_t INSERT_ZWJ_AND_HYPHEN_AT_END = 0x05;
  static constexpr uint32_t REPLACE_WITH_HYPHEN_AT_END = 0x06;
  static constexpr uint32_t BREAK_AT_END = 0x07;

  // Edits applied at the start of the following line.
  static constexpr uint32_t INSERT_HYPHEN_AT_START = 0x01 << 3;
  static constexpr uint32_t INSERT_ZWJ_AT_START = 0x02 << 3;
  static constexpr uint32_t BREAK_AT_START = 0x03 << 3;

  // Keep in sync with the definitions in the Java code at:
  // frameworks/base/graphics/java/android/graphics/Paint.java
  static constexpr uint32_t MASK_END_OF_LINE = 0x07;
  static constexpr uint32_t MASK_START_OF_LINE = 0x03 << 3;

  // Returns true only when `hyph` is exactly REPLACE_WITH_HYPHEN_AT_END.
  static constexpr bool isReplacement(uint32_t hyph) {
    return hyph == REPLACE_WITH_HYPHEN_AT_END;
  }

  // Returns true when `hyph` is exactly one of the INSERT_* values (either an
  // end-of-line or a start-of-line insertion). The comparison is against the
  // whole value, so a combined start+end edit does not match.
  static constexpr bool isInsertion(uint32_t hyph) {
    return (hyph == INSERT_HYPHEN_AT_END ||
            hyph == INSERT_ARMENIAN_HYPHEN_AT_END ||
            hyph == INSERT_MAQAF_AT_END || hyph == INSERT_UCAS_HYPHEN_AT_END ||
            hyph == INSERT_ZWJ_AND_HYPHEN_AT_END ||
            hyph == INSERT_HYPHEN_AT_START || hyph == INSERT_ZWJ_AT_START);
  }

  // Defined out of line (not visible in this header).
  static const uint32_t* getHyphenString(uint32_t hyph);
  static uint32_t editForThisLine(HyphenationType type);
  static uint32_t editForNextLine(HyphenationType type);

  // Default-constructed edits carry NO_EDIT.
  constexpr HyphenEdit() : hyphen(NO_EDIT) {}
  // Intentionally implicit: allows a raw edit value to be used as a HyphenEdit.
  constexpr HyphenEdit(uint32_t hyphenInt)  // NOLINT(implicit)
      : hyphen(hyphenInt) {}

  // Returns the raw combined edit value.
  constexpr uint32_t getHyphen() const { return hyphen; }
  constexpr bool operator==(const HyphenEdit& other) const {
    return hyphen == other.hyphen;
  }

  // Returns only the end-of-line portion of the edit.
  constexpr uint32_t getEnd() const { return hyphen & MASK_END_OF_LINE; }
  // Returns only the start-of-line portion of the edit (still shifted, i.e.
  // directly comparable with the *_AT_START constants).
  constexpr uint32_t getStart() const { return hyphen & MASK_START_OF_LINE; }

 private:
  uint32_t hyphen;
};
125 
126 // hyb file header; implementation details are in the .cpp file
127 struct Header;
128 
129 class Hyphenator {
130  public:
131  // Compute the hyphenation of a word, storing the hyphenation in result
132  // vector. Each entry in the vector is a "hyphenation type" for a potential
133  // hyphenation that can be applied at the corresponding code unit offset in
134  // the word.
135  //
136  // Example: word is "hyphen", result is the following, corresponding to
137  // "hy-phen": [DONT_BREAK, DONT_BREAK, BREAK_AND_INSERT_HYPHEN, DONT_BREAK,
138  // DONT_BREAK, DONT_BREAK]
139  void hyphenate(std::vector<HyphenationType>* result,
140  const uint16_t* word,
141  size_t len,
142  const icu::Locale& locale);
143 
144  // Returns true if the codepoint is like U+2010 HYPHEN in line breaking and
145  // usage: a character immediately after which line breaks are allowed, but
146  // words containing it should not be automatically hyphenated.
147  static bool isLineBreakingHyphen(uint32_t cp);
148 
149  // pattern data is in binary format, as described in doc/hyb_file_format.md.
150  // Note: the caller is responsible for ensuring that the lifetime of the
151  // pattern data is at least as long as the Hyphenator object.
152 
153  // Note: nullptr is valid input, in which case the hyphenator only processes
154  // soft hyphens.
155  static Hyphenator* loadBinary(const uint8_t* patternData,
156  size_t minPrefix,
157  size_t minSuffix);
158 
159  private:
160  // apply various hyphenation rules including hard and soft hyphens, ignoring
161  // patterns
162  void hyphenateWithNoPatterns(HyphenationType* result,
163  const uint16_t* word,
164  size_t len,
165  const icu::Locale& locale);
166 
167  // Try looking up word in alphabet table, return DONT_BREAK if any code units
168  // fail to map. Otherwise, returns BREAK_AND_INSERT_HYPHEN,
169  // BREAK_AND_INSERT_ARMENIAN_HYPHEN, or BREAK_AND_DONT_INSERT_HYPHEN based on
170  // the script of the characters seen. Note that this method writes len+2
171  // entries into alpha_codes (including start and stop)
172  HyphenationType alphabetLookup(uint16_t* alpha_codes,
173  const uint16_t* word,
174  size_t len);
175 
176  // calculate hyphenation from patterns, assuming alphabet lookup has already
177  // been done
178  void hyphenateFromCodes(HyphenationType* result,
179  const uint16_t* codes,
180  size_t len,
181  HyphenationType hyphenValue);
182 
183  // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is
184  // used so that temporary buffers can be stack-allocated without waste, which
185  // is a slightly different use case. It measures UTF-16 code units.
186  static const size_t MAX_HYPHENATED_SIZE = 64;
187 
188  const uint8_t* patternData;
189  size_t minPrefix, minSuffix;
190 
191  // accessors for binary data
192  const Header* getHeader() const {
193  return reinterpret_cast<const Header*>(patternData);
194  }
195 };
196 
197 } // namespace minikin
198 
199 #endif // MINIKIN_HYPHENATOR_H
uint32_t getHyphen() const
Definition: Hyphenator.h:114
HyphenEdit(uint32_t hyphenInt)
Definition: Hyphenator.h:113
static bool isInsertion(uint32_t hyph)
Definition: Hyphenator.h:100
uint32_t getStart() const
Definition: Hyphenator.h:120
uint32_t getEnd() const
Definition: Hyphenator.h:119
HyphenationType
Definition: Hyphenator.h:35
bool operator==(const HyphenEdit &other) const
Definition: Hyphenator.h:115
static bool isReplacement(uint32_t hyph)
Definition: Hyphenator.h:96