1 /*
2 * Copyright 2022 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "modules/skunicode/include/SkUnicode_libgrapheme.h"
9
10 #include "include/core/SkSpan.h"
11 #include "include/core/SkString.h"
12 #include "include/core/SkTypes.h"
13 #include "include/private/base/SkTArray.h"
14 #include "modules/skunicode/include/SkUnicode.h"
15 #include "modules/skunicode/src/SkBidiFactory_icu_subset.h"
16 #include "modules/skunicode/src/SkUnicode_hardcoded.h"
17 #include "modules/skunicode/src/SkUnicode_icu_bidi.h"
18 #include "src/base/SkBitmaskEnum.h"
19
20 extern "C" {
21 #include <grapheme.h>
22 }
23 #include <array>
24 #include <memory>
25 #include <vector>
26 #include <unordered_map>
27
28 using namespace skia_private;
29
30 class SkUnicode_libgrapheme : public SkUnicodeHardCodedCharProperties {
31 public:
SkUnicode_libgrapheme()32 SkUnicode_libgrapheme() { }
33
34 ~SkUnicode_libgrapheme() override = default;
35
36 // For SkShaper
37 std::unique_ptr<SkBidiIterator> makeBidiIterator(const uint16_t text[], int count,
38 SkBidiIterator::Direction dir) override;
39 std::unique_ptr<SkBidiIterator> makeBidiIterator(const char text[],
40 int count,
41 SkBidiIterator::Direction dir) override;
42 std::unique_ptr<SkBreakIterator> makeBreakIterator(const char locale[],
43 BreakType breakType) override;
44 std::unique_ptr<SkBreakIterator> makeBreakIterator(BreakType breakType) override;
getBidiRegions(const char utf8[],int utf8Units,TextDirection dir,std::vector<BidiRegion> * results)45 bool getBidiRegions(const char utf8[],
46 int utf8Units,
47 TextDirection dir,
48 std::vector<BidiRegion>* results) override {
49 return fBidiFact->ExtractBidi(utf8, utf8Units, dir, results);
50 }
51
getSentences(const char utf8[],int utf8Units,const char * locale,std::vector<SkUnicode::Position> * results)52 bool getSentences(const char utf8[],
53 int utf8Units,
54 const char* locale,
55 std::vector<SkUnicode::Position>* results) override {
56 SkDEBUGF("Method 'getSentences' is not implemented\n");
57 return false;
58 }
59
computeCodeUnitFlags(char utf8[],int utf8Units,bool replaceTabs,skia_private::TArray<SkUnicode::CodeUnitFlags,true> * results)60 bool computeCodeUnitFlags(char utf8[],
61 int utf8Units,
62 bool replaceTabs,
63 skia_private::TArray<SkUnicode::CodeUnitFlags, true>* results) override {
64 results->clear();
65 results->push_back_n(utf8Units + 1, CodeUnitFlags::kNoCodeUnitFlag);
66
67 size_t lineBreak = 0;
68 (*results)[lineBreak] |= CodeUnitFlags::kSoftLineBreakBefore;
69 while (lineBreak < utf8Units) {
70 lineBreak += grapheme_next_line_break_utf8(utf8 + lineBreak, utf8Units - lineBreak);
71 // Check if the previous code unit is a hard break.
72 auto codePoint = utf8[lineBreak - 1];
73 (*results)[lineBreak] |= this->isHardBreak(codePoint)
74 ? CodeUnitFlags::kHardLineBreakBefore
75 : CodeUnitFlags::kSoftLineBreakBefore;
76 }
77 (*results)[utf8Units] |= CodeUnitFlags::kSoftLineBreakBefore;
78
79 size_t graphemeBreak = 0;
80 (*results)[graphemeBreak] |= CodeUnitFlags::kGraphemeStart;
81 while (graphemeBreak < utf8Units) {
82 graphemeBreak += grapheme_next_character_break_utf8(utf8 + graphemeBreak, utf8Units - graphemeBreak);
83 (*results)[graphemeBreak] |= CodeUnitFlags::kGraphemeStart;
84 }
85
86 const char* current = utf8;
87 const char* end = utf8 + utf8Units;
88 while (current < end) {
89 auto before = current - utf8;
90 SkUnichar unichar = SkUTF::NextUTF8(¤t, end);
91 if (unichar < 0) unichar = 0xFFFD;
92 auto after = current - utf8;
93 if (replaceTabs && this->isTabulation(unichar)) {
94 results->at(before) |= SkUnicode::kTabulation;
95 if (replaceTabs) {
96 unichar = ' ';
97 utf8[before] = ' ';
98 }
99 }
100 for (auto i = before; i < after; ++i) {
101 if (this->isSpace(unichar)) {
102 results->at(i) |= SkUnicode::kPartOfIntraWordBreak;
103 }
104 if (this->isWhitespace(unichar)) {
105 results->at(i) |= SkUnicode::kPartOfWhiteSpaceBreak;
106 }
107 if (this->isControl(unichar)) {
108 results->at(i) |= SkUnicode::kControl;
109 }
110 }
111 }
112 return true;
113 }
114
computeCodeUnitFlags(char16_t utf16[],int utf16Units,bool replaceTabs,skia_private::TArray<SkUnicode::CodeUnitFlags,true> * results)115 bool computeCodeUnitFlags(char16_t utf16[], int utf16Units, bool replaceTabs,
116 skia_private::TArray<SkUnicode::CodeUnitFlags, true>* results) override {
117 SkASSERT(false);
118 return false;
119 }
120
getUtf8To16Mapping(const char utf8[],int utf8Units,std::unordered_map<Position,Position> * results)121 bool getUtf8To16Mapping(const char utf8[], int utf8Units, std::unordered_map<Position, Position>* results) {
122 int utf16Units = 0;
123 const char* ptr8 = utf8;
124 const char* end8 = utf8 + utf8Units;
125 while (ptr8 < end8) {
126 results->emplace(ptr8 - utf8, utf16Units);
127 SkUnichar uni = SkUTF::NextUTF8(&ptr8, end8);
128 if (uni < 0) {
129 return false;
130 }
131
132 uint16_t utf16[2];
133 size_t count = SkUTF::ToUTF16(uni, utf16);
134 if (count == 0) {
135 return false;
136 }
137 utf16Units += count;
138 }
139 results->emplace(utf8Units, utf16Units);
140 return true;
141 }
142
getWords(const char utf8[],int utf8Units,const char * locale,std::vector<Position> * results)143 bool getWords(const char utf8[], int utf8Units, const char* locale, std::vector<Position>* results) override {
144 std::unordered_map<Position, Position> mapping;
145 if (!getUtf8To16Mapping(utf8, utf8Units, &mapping)) {
146 return false;
147 }
148 size_t wordBreak = 0;
149 while (wordBreak < utf8Units) {
150 wordBreak += grapheme_next_word_break_utf8(utf8 + wordBreak, utf8Units - wordBreak);
151 if (mapping.find(wordBreak) == mapping.end()) {
152 return false;
153 }
154 results->emplace_back(mapping[wordBreak]);
155 }
156 return true;
157 }
158
getUtf8Words(const char utf8[],int utf8Units,const char * locale,std::vector<Position> * results)159 bool getUtf8Words(const char utf8[],
160 int utf8Units,
161 const char* locale,
162 std::vector<Position>* results) override {
163 // Let's consider sort line breaks, whitespaces and CJK codepoints instead
164 std::vector<CodeUnitFlags> breaks(utf8Units + 1, CodeUnitFlags::kNoCodeUnitFlag);
165
166 size_t lineBreak = 0;
167 breaks[lineBreak] = CodeUnitFlags::kSoftLineBreakBefore;
168 while (lineBreak < utf8Units) {
169 lineBreak += grapheme_next_line_break_utf8(utf8 + lineBreak, utf8Units - lineBreak);
170 breaks[lineBreak] = CodeUnitFlags::kSoftLineBreakBefore;
171 }
172 breaks[lineBreak] = CodeUnitFlags::kSoftLineBreakBefore;
173
174 const char* current = utf8;
175 const char* end = utf8 + utf8Units;
176 while (current < end) {
177 auto index = current - utf8;
178 SkUnichar unichar = SkUTF::NextUTF8(¤t, end);
179 if (this->isWhitespace(unichar)) {
180 breaks[index] = CodeUnitFlags::kPartOfWhiteSpaceBreak;
181 } else if (this->isIdeographic(unichar)) {
182 breaks[index] = CodeUnitFlags::kIdeographic;
183 }
184 }
185
186 bool whitespaces = false;
187 for (size_t i = 0; i < breaks.size(); ++i) {
188 auto b = breaks[i];
189 if (b == CodeUnitFlags::kSoftLineBreakBefore) {
190 results->emplace_back(i);
191 whitespaces = false;
192 } else if (b == CodeUnitFlags::kIdeographic) {
193 results->emplace_back(i);
194 whitespaces = false;
195 } else if (b == CodeUnitFlags::kPartOfWhiteSpaceBreak) {
196 if (!whitespaces) {
197 results->emplace_back(i);
198 }
199 whitespaces = true;
200 } else {
201 whitespaces = false;
202 }
203 }
204
205 return true;
206
207 /*
208 size_t wordBreak = 0;
209 while (wordBreak < utf8Units) {
210 wordBreak += grapheme_next_word_break_utf8(utf8 + wordBreak, utf8Units - wordBreak);
211 results->emplace_back(wordBreak);
212 }
213 return true;
214 */
215 }
216
// Upper-cases |str| without a locale by delegating to the two-argument
// overload with a null locale.
SkString toUpper(const SkString& str) override {
    const char* const noLocale = nullptr;
    return this->toUpper(str, noLocale);
}
220
// Returns an upper-cased copy of |str| produced by
// grapheme_to_uppercase_utf8. |locale| is not used.
//
// The result is sized from libgrapheme's own measurement rather than from the
// input length, because an upper-case mapping can be longer than its source
// (for example U+00DF maps to "SS").
SkString toUpper(const SkString& str, const char* locale) override {
    // Measuring pass: with a null destination nothing is written and the
    // return value is the number of bytes the conversion needs.
    const size_t upperSize = grapheme_to_uppercase_utf8(str.data(), str.size(), nullptr, 0);
    if (upperSize == 0) {
        return SkString();
    }

    // Allocate exactly |upperSize| bytes. The previous SkString(" ", n) form
    // copied n bytes out of a two-byte literal, reading past its end.
    SkString res(upperSize);

    // NOTE(review): libgrapheme appears to NUL-terminate the destination,
    // overwriting the last byte when the output fills the buffer exactly -
    // confirm against the man page. SkString stores a terminating NUL right
    // after its size() bytes, so that slot is included in the destination
    // length to keep the last converted byte intact.
    grapheme_to_uppercase_utf8(str.data(), str.size(), res.data(), res.size() + 1);
    return res;
}
226
reorderVisual(const BidiLevel runLevels[],int levelsCount,int32_t logicalFromVisual[])227 void reorderVisual(const BidiLevel runLevels[],
228 int levelsCount,
229 int32_t logicalFromVisual[]) override {
230 fBidiFact->bidi_reorderVisual(runLevels, levelsCount, logicalFromVisual);
231 }
232 private:
233 friend class SkBreakIterator_libgrapheme;
234
235 sk_sp<SkBidiFactory> fBidiFact = sk_make_sp<SkBidiSubsetFactory>();
236 };
237
238 class SkBreakIterator_libgrapheme: public SkBreakIterator {
239 SkUnicode_libgrapheme* fUnicode;
240 std::vector<SkUnicode::LineBreakBefore> fLineBreaks;
241 Position fLineBreakIndex;
242 static constexpr const int kDone = -1;
243 public:
SkBreakIterator_libgrapheme(SkUnicode_libgrapheme * unicode)244 explicit SkBreakIterator_libgrapheme(SkUnicode_libgrapheme* unicode) : fUnicode(unicode) { }
first()245 Position first() override
246 { return fLineBreaks[(fLineBreakIndex = 0)].pos; }
current()247 Position current() override
248 { return fLineBreaks[fLineBreakIndex].pos; }
next()249 Position next() override
250 { return fLineBreaks[++fLineBreakIndex].pos; }
status()251 Status status() override {
252 return fLineBreaks[fLineBreakIndex].breakType ==
253 SkUnicode::LineBreakType::kHardLineBreak
254 ? SkUnicode::CodeUnitFlags::kHardLineBreakBefore
255 : SkUnicode::CodeUnitFlags::kSoftLineBreakBefore;
256 }
isDone()257 bool isDone() override { return fLineBreaks[fLineBreakIndex].pos == kDone; }
setText(const char utftext8[],int utf8Units)258 bool setText(const char utftext8[], int utf8Units) override {
259 fLineBreaks.clear();
260 size_t lineBreak = 0;
261 // first() must always go to the beginning of the string.
262 fLineBreaks.emplace_back(0, SkUnicode::LineBreakType::kHardLineBreak);
263 for (size_t pos = 0; pos < utf8Units;) {
264 pos += grapheme_next_line_break_utf8(utftext8 + pos, utf8Units - pos);
265 auto codePoint = utftext8[pos];
266 fLineBreaks.emplace_back(pos,
267 fUnicode->isHardBreak(codePoint)
268 ? SkUnicode::LineBreakType::kHardLineBreak
269 : SkUnicode::LineBreakType::kSoftLineBreak);
270 }
271 // There is always an "end" which signals "done".
272 fLineBreaks.emplace_back(kDone, SkUnicode::LineBreakType::kHardLineBreak);
273 fLineBreakIndex = 0;
274 return true;
275 }
setText(const char16_t utftext16[],int utf16Units)276 bool setText(const char16_t utftext16[], int utf16Units) override {
277 SkASSERT(false);
278 return false;
279 }
280 };
281
makeBidiIterator(const uint16_t text[],int count,SkBidiIterator::Direction dir)282 std::unique_ptr<SkBidiIterator> SkUnicode_libgrapheme::makeBidiIterator(const uint16_t text[], int count,
283 SkBidiIterator::Direction dir) {
284 return fBidiFact->MakeIterator(text, count, dir);
285 }
makeBidiIterator(const char text[],int count,SkBidiIterator::Direction dir)286 std::unique_ptr<SkBidiIterator> SkUnicode_libgrapheme::makeBidiIterator(const char text[],
287 int count,
288 SkBidiIterator::Direction dir) {
289 return fBidiFact->MakeIterator(text, count, dir);
290 }
makeBreakIterator(const char locale[],BreakType breakType)291 std::unique_ptr<SkBreakIterator> SkUnicode_libgrapheme::makeBreakIterator(const char locale[],
292 BreakType breakType) {
293 return std::make_unique<SkBreakIterator_libgrapheme>(this);
294 }
makeBreakIterator(BreakType breakType)295 std::unique_ptr<SkBreakIterator> SkUnicode_libgrapheme::makeBreakIterator(BreakType breakType) {
296 return std::make_unique<SkBreakIterator_libgrapheme>(this);
297 }
298
299 namespace SkUnicodes::Libgrapheme {
Make()300 sk_sp<SkUnicode> Make() {
301 return sk_make_sp<SkUnicode_libgrapheme>();
302 }
303 }
304