xref: /aosp_15_r20/external/skia/modules/skunicode/src/SkUnicode_libgrapheme.cpp (revision c8dee2aa9b3f27cf6c858bd81872bdeb2c07ed17)
1 /*
2 * Copyright 2022 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7 
8 #include "modules/skunicode/include/SkUnicode_libgrapheme.h"
9 
10 #include "include/core/SkSpan.h"
11 #include "include/core/SkString.h"
12 #include "include/core/SkTypes.h"
13 #include "include/private/base/SkTArray.h"
14 #include "modules/skunicode/include/SkUnicode.h"
15 #include "modules/skunicode/src/SkBidiFactory_icu_subset.h"
16 #include "modules/skunicode/src/SkUnicode_hardcoded.h"
17 #include "modules/skunicode/src/SkUnicode_icu_bidi.h"
18 #include "src/base/SkBitmaskEnum.h"
19 
20 extern "C" {
21 #include <grapheme.h>
22 }
23 #include <array>
24 #include <memory>
25 #include <vector>
26 #include <unordered_map>
27 
28 using namespace skia_private;
29 
30 class SkUnicode_libgrapheme : public SkUnicodeHardCodedCharProperties {
31 public:
SkUnicode_libgrapheme()32     SkUnicode_libgrapheme() { }
33 
34     ~SkUnicode_libgrapheme() override = default;
35 
36     // For SkShaper
37     std::unique_ptr<SkBidiIterator> makeBidiIterator(const uint16_t text[], int count,
38                                                      SkBidiIterator::Direction dir) override;
39     std::unique_ptr<SkBidiIterator> makeBidiIterator(const char text[],
40                                                      int count,
41                                                      SkBidiIterator::Direction dir) override;
42     std::unique_ptr<SkBreakIterator> makeBreakIterator(const char locale[],
43                                                        BreakType breakType) override;
44     std::unique_ptr<SkBreakIterator> makeBreakIterator(BreakType breakType) override;
getBidiRegions(const char utf8[],int utf8Units,TextDirection dir,std::vector<BidiRegion> * results)45     bool getBidiRegions(const char utf8[],
46                         int utf8Units,
47                         TextDirection dir,
48                         std::vector<BidiRegion>* results) override {
49         return fBidiFact->ExtractBidi(utf8, utf8Units, dir, results);
50     }
51 
getSentences(const char utf8[],int utf8Units,const char * locale,std::vector<SkUnicode::Position> * results)52     bool getSentences(const char utf8[],
53                       int utf8Units,
54                       const char* locale,
55                       std::vector<SkUnicode::Position>* results) override {
56         SkDEBUGF("Method 'getSentences' is not implemented\n");
57         return false;
58     }
59 
computeCodeUnitFlags(char utf8[],int utf8Units,bool replaceTabs,skia_private::TArray<SkUnicode::CodeUnitFlags,true> * results)60     bool computeCodeUnitFlags(char utf8[],
61                               int utf8Units,
62                               bool replaceTabs,
63                               skia_private::TArray<SkUnicode::CodeUnitFlags, true>* results) override {
64         results->clear();
65         results->push_back_n(utf8Units + 1, CodeUnitFlags::kNoCodeUnitFlag);
66 
67         size_t lineBreak = 0;
68         (*results)[lineBreak] |= CodeUnitFlags::kSoftLineBreakBefore;
69         while (lineBreak < utf8Units) {
70             lineBreak += grapheme_next_line_break_utf8(utf8 + lineBreak, utf8Units - lineBreak);
71             // Check if the previous code unit is a hard break.
72             auto codePoint = utf8[lineBreak - 1];
73             (*results)[lineBreak] |= this->isHardBreak(codePoint)
74                                     ? CodeUnitFlags::kHardLineBreakBefore
75                                     : CodeUnitFlags::kSoftLineBreakBefore;
76         }
77         (*results)[utf8Units] |= CodeUnitFlags::kSoftLineBreakBefore;
78 
79         size_t graphemeBreak = 0;
80         (*results)[graphemeBreak] |= CodeUnitFlags::kGraphemeStart;
81         while (graphemeBreak < utf8Units) {
82             graphemeBreak += grapheme_next_character_break_utf8(utf8 + graphemeBreak, utf8Units - graphemeBreak);
83             (*results)[graphemeBreak] |= CodeUnitFlags::kGraphemeStart;
84         }
85 
86         const char* current = utf8;
87         const char* end = utf8 + utf8Units;
88         while (current < end) {
89             auto before = current - utf8;
90             SkUnichar unichar = SkUTF::NextUTF8(&current, end);
91             if (unichar < 0) unichar = 0xFFFD;
92             auto after = current - utf8;
93             if (replaceTabs && this->isTabulation(unichar)) {
94                 results->at(before) |= SkUnicode::kTabulation;
95                 if (replaceTabs) {
96                     unichar = ' ';
97                     utf8[before] = ' ';
98                 }
99             }
100             for (auto i = before; i < after; ++i) {
101                 if (this->isSpace(unichar)) {
102                     results->at(i) |= SkUnicode::kPartOfIntraWordBreak;
103                 }
104                 if (this->isWhitespace(unichar)) {
105                     results->at(i) |= SkUnicode::kPartOfWhiteSpaceBreak;
106                 }
107                 if (this->isControl(unichar)) {
108                     results->at(i) |= SkUnicode::kControl;
109                 }
110             }
111         }
112         return true;
113     }
114 
computeCodeUnitFlags(char16_t utf16[],int utf16Units,bool replaceTabs,skia_private::TArray<SkUnicode::CodeUnitFlags,true> * results)115     bool computeCodeUnitFlags(char16_t utf16[], int utf16Units, bool replaceTabs,
116                           skia_private::TArray<SkUnicode::CodeUnitFlags, true>* results) override {
117         SkASSERT(false);
118         return false;
119     }
120 
getUtf8To16Mapping(const char utf8[],int utf8Units,std::unordered_map<Position,Position> * results)121     bool getUtf8To16Mapping(const char utf8[], int utf8Units, std::unordered_map<Position, Position>* results) {
122         int utf16Units = 0;
123         const char* ptr8 = utf8;
124         const char* end8 = utf8 + utf8Units;
125         while (ptr8 < end8) {
126             results->emplace(ptr8 - utf8, utf16Units);
127             SkUnichar uni = SkUTF::NextUTF8(&ptr8, end8);
128             if (uni < 0) {
129                 return false;
130             }
131 
132             uint16_t utf16[2];
133             size_t count = SkUTF::ToUTF16(uni, utf16);
134             if (count == 0) {
135                 return false;
136             }
137             utf16Units += count;
138         }
139         results->emplace(utf8Units, utf16Units);
140         return true;
141     }
142 
getWords(const char utf8[],int utf8Units,const char * locale,std::vector<Position> * results)143     bool getWords(const char utf8[], int utf8Units, const char* locale, std::vector<Position>* results) override {
144         std::unordered_map<Position, Position> mapping;
145         if (!getUtf8To16Mapping(utf8, utf8Units, &mapping)) {
146             return false;
147         }
148         size_t wordBreak = 0;
149         while (wordBreak < utf8Units) {
150             wordBreak += grapheme_next_word_break_utf8(utf8 + wordBreak, utf8Units - wordBreak);
151             if (mapping.find(wordBreak) == mapping.end()) {
152                 return false;
153             }
154             results->emplace_back(mapping[wordBreak]);
155         }
156         return true;
157     }
158 
getUtf8Words(const char utf8[],int utf8Units,const char * locale,std::vector<Position> * results)159     bool getUtf8Words(const char utf8[],
160                       int utf8Units,
161                       const char* locale,
162                       std::vector<Position>* results) override {
163         // Let's consider sort line breaks, whitespaces and CJK codepoints instead
164         std::vector<CodeUnitFlags> breaks(utf8Units + 1, CodeUnitFlags::kNoCodeUnitFlag);
165 
166         size_t lineBreak = 0;
167         breaks[lineBreak] = CodeUnitFlags::kSoftLineBreakBefore;
168         while (lineBreak < utf8Units) {
169             lineBreak += grapheme_next_line_break_utf8(utf8 + lineBreak, utf8Units - lineBreak);
170             breaks[lineBreak] = CodeUnitFlags::kSoftLineBreakBefore;
171         }
172         breaks[lineBreak] = CodeUnitFlags::kSoftLineBreakBefore;
173 
174         const char* current = utf8;
175         const char* end = utf8 + utf8Units;
176         while (current < end) {
177             auto index = current - utf8;
178             SkUnichar unichar = SkUTF::NextUTF8(&current, end);
179             if (this->isWhitespace(unichar)) {
180                 breaks[index] = CodeUnitFlags::kPartOfWhiteSpaceBreak;
181             } else if (this->isIdeographic(unichar)) {
182                 breaks[index] = CodeUnitFlags::kIdeographic;
183             }
184         }
185 
186         bool whitespaces = false;
187         for (size_t i = 0; i < breaks.size(); ++i) {
188             auto b = breaks[i];
189             if (b == CodeUnitFlags::kSoftLineBreakBefore) {
190                 results->emplace_back(i);
191                 whitespaces = false;
192             } else if (b == CodeUnitFlags::kIdeographic) {
193                 results->emplace_back(i);
194                 whitespaces = false;
195             } else if (b == CodeUnitFlags::kPartOfWhiteSpaceBreak) {
196                 if (!whitespaces) {
197                     results->emplace_back(i);
198                 }
199                 whitespaces = true;
200             } else {
201                 whitespaces = false;
202             }
203         }
204 
205         return true;
206 
207         /*
208         size_t wordBreak = 0;
209         while (wordBreak < utf8Units) {
210             wordBreak += grapheme_next_word_break_utf8(utf8 + wordBreak, utf8Units - wordBreak);
211             results->emplace_back(wordBreak);
212         }
213         return true;
214         */
215     }
216 
toUpper(const SkString & str)217     SkString toUpper(const SkString& str) override {
218         return this->toUpper(str, nullptr);
219     }
220 
toUpper(const SkString & str,const char * locale)221     SkString toUpper(const SkString& str, const char* locale) override {
222         SkString res(" ", str.size());
223         grapheme_to_uppercase_utf8(str.data(), str.size(), res.data(), res.size());
224         return res;
225     }
226 
reorderVisual(const BidiLevel runLevels[],int levelsCount,int32_t logicalFromVisual[])227     void reorderVisual(const BidiLevel runLevels[],
228                        int levelsCount,
229                        int32_t logicalFromVisual[]) override {
230         fBidiFact->bidi_reorderVisual(runLevels, levelsCount, logicalFromVisual);
231     }
232 private:
233     friend class SkBreakIterator_libgrapheme;
234 
235     sk_sp<SkBidiFactory> fBidiFact = sk_make_sp<SkBidiSubsetFactory>();
236 };
237 
238 class SkBreakIterator_libgrapheme: public SkBreakIterator {
239     SkUnicode_libgrapheme* fUnicode;
240     std::vector<SkUnicode::LineBreakBefore> fLineBreaks;
241     Position fLineBreakIndex;
242     static constexpr const int kDone = -1;
243 public:
SkBreakIterator_libgrapheme(SkUnicode_libgrapheme * unicode)244     explicit SkBreakIterator_libgrapheme(SkUnicode_libgrapheme* unicode) : fUnicode(unicode) { }
first()245     Position first() override
246       { return fLineBreaks[(fLineBreakIndex = 0)].pos; }
current()247     Position current() override
248       { return fLineBreaks[fLineBreakIndex].pos; }
next()249     Position next() override
250       { return fLineBreaks[++fLineBreakIndex].pos; }
status()251     Status status() override {
252         return fLineBreaks[fLineBreakIndex].breakType ==
253                        SkUnicode::LineBreakType::kHardLineBreak
254                        ? SkUnicode::CodeUnitFlags::kHardLineBreakBefore
255                        : SkUnicode::CodeUnitFlags::kSoftLineBreakBefore;
256     }
isDone()257     bool isDone() override { return fLineBreaks[fLineBreakIndex].pos == kDone; }
setText(const char utftext8[],int utf8Units)258     bool setText(const char utftext8[], int utf8Units) override {
259         fLineBreaks.clear();
260         size_t lineBreak = 0;
261         // first() must always go to the beginning of the string.
262         fLineBreaks.emplace_back(0, SkUnicode::LineBreakType::kHardLineBreak);
263         for (size_t pos = 0; pos < utf8Units;) {
264             pos += grapheme_next_line_break_utf8(utftext8 + pos, utf8Units - pos);
265             auto codePoint = utftext8[pos];
266             fLineBreaks.emplace_back(pos,
267                                      fUnicode->isHardBreak(codePoint)
268                                     ? SkUnicode::LineBreakType::kHardLineBreak
269                                     : SkUnicode::LineBreakType::kSoftLineBreak);
270         }
271         // There is always an "end" which signals "done".
272         fLineBreaks.emplace_back(kDone, SkUnicode::LineBreakType::kHardLineBreak);
273         fLineBreakIndex = 0;
274         return true;
275     }
setText(const char16_t utftext16[],int utf16Units)276     bool setText(const char16_t utftext16[], int utf16Units) override {
277         SkASSERT(false);
278         return false;
279     }
280 };
281 
makeBidiIterator(const uint16_t text[],int count,SkBidiIterator::Direction dir)282 std::unique_ptr<SkBidiIterator> SkUnicode_libgrapheme::makeBidiIterator(const uint16_t text[], int count,
283                                                  SkBidiIterator::Direction dir) {
284     return fBidiFact->MakeIterator(text, count, dir);
285 }
makeBidiIterator(const char text[],int count,SkBidiIterator::Direction dir)286 std::unique_ptr<SkBidiIterator> SkUnicode_libgrapheme::makeBidiIterator(const char text[],
287                                                  int count,
288                                                  SkBidiIterator::Direction dir) {
289     return fBidiFact->MakeIterator(text, count, dir);
290 }
makeBreakIterator(const char locale[],BreakType breakType)291 std::unique_ptr<SkBreakIterator> SkUnicode_libgrapheme::makeBreakIterator(const char locale[],
292                                                    BreakType breakType) {
293     return std::make_unique<SkBreakIterator_libgrapheme>(this);
294 }
makeBreakIterator(BreakType breakType)295 std::unique_ptr<SkBreakIterator> SkUnicode_libgrapheme::makeBreakIterator(BreakType breakType) {
296     return std::make_unique<SkBreakIterator_libgrapheme>(this);
297 }
298 
299 namespace SkUnicodes::Libgrapheme {
Make()300 sk_sp<SkUnicode> Make() {
301     return sk_make_sp<SkUnicode_libgrapheme>();
302 }
303 }
304