/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "utils/tokenizer-utils.h"

#include "gmock/gmock.h"
#include "gtest/gtest.h"

namespace libtextclassifier3 {
namespace {

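// Token offsets are counted in Unicode codepoints, not bytes: "Jörg" spans
// [9, 13) even though its UTF-8 encoding is five bytes long.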
TEST(TokenizerUtilTest, TokenizeOnSpace) {
  std::vector<Token> tokens =
      TokenizeOnSpace("Where is Jörg Borg located? Maybe in Zürich ...");

  EXPECT_EQ(tokens.size(), 9);

  EXPECT_EQ(tokens[0].value, "Where");
  EXPECT_EQ(tokens[0].start, 0);
  EXPECT_EQ(tokens[0].end, 5);

  EXPECT_EQ(tokens[1].value, "is");
  EXPECT_EQ(tokens[1].start, 6);
  EXPECT_EQ(tokens[1].end, 8);

  EXPECT_EQ(tokens[2].value, "Jörg");
  EXPECT_EQ(tokens[2].start, 9);
  EXPECT_EQ(tokens[2].end, 13);

  EXPECT_EQ(tokens[3].value, "Borg");
  EXPECT_EQ(tokens[3].start, 14);
  EXPECT_EQ(tokens[3].end, 18);

  EXPECT_EQ(tokens[4].value, "located?");
  EXPECT_EQ(tokens[4].start, 19);
  EXPECT_EQ(tokens[4].end, 27);

  EXPECT_EQ(tokens[5].value, "Maybe");
  EXPECT_EQ(tokens[5].start, 28);
  EXPECT_EQ(tokens[5].end, 33);

  EXPECT_EQ(tokens[6].value, "in");
  EXPECT_EQ(tokens[6].start, 34);
  EXPECT_EQ(tokens[6].end, 36);

  EXPECT_EQ(tokens[7].value, "Zürich");
  EXPECT_EQ(tokens[7].start, 37);
  EXPECT_EQ(tokens[7].end, 43);

  EXPECT_EQ(tokens[8].value, "...");
  EXPECT_EQ(tokens[8].start, 44);
  EXPECT_EQ(tokens[8].end, 47);
}

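// By default, TokenizeOnDelimiters drops the delimiter characters and emits
// no empty tokens for runs of consecutive delimiters ("   " and "?!"); the
// non-delimiter ':' between '!' and the space still becomes its own token.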
TEST(TokenizerUtilTest, TokenizeOnDelimiters) {
  std::vector<Token> tokens = TokenizeOnDelimiters(
      "This   might be čomplíčateď?!: Oder?", {' ', '?', '!'});

  EXPECT_EQ(tokens.size(), 6);

  EXPECT_EQ(tokens[0].value, "This");
  EXPECT_EQ(tokens[0].start, 0);
  EXPECT_EQ(tokens[0].end, 4);

  EXPECT_EQ(tokens[1].value, "might");
  EXPECT_EQ(tokens[1].start, 7);
  EXPECT_EQ(tokens[1].end, 12);

  EXPECT_EQ(tokens[2].value, "be");
  EXPECT_EQ(tokens[2].start, 13);
  EXPECT_EQ(tokens[2].end, 15);

  EXPECT_EQ(tokens[3].value, "čomplíčateď");
  EXPECT_EQ(tokens[3].start, 16);
  EXPECT_EQ(tokens[3].end, 27);

  EXPECT_EQ(tokens[4].value, ":");
  EXPECT_EQ(tokens[4].start, 29);
  EXPECT_EQ(tokens[4].end, 30);

  EXPECT_EQ(tokens[5].value, "Oder");
  EXPECT_EQ(tokens[5].start, 31);
  EXPECT_EQ(tokens[5].end, 35);
}

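// With create_tokens_for_non_space_delimiters=true, the non-space delimiters
// '?' and '!' are emitted as single-codepoint tokens of their own, while
// space delimiters are still dropped.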
TEST(TokenizerUtilTest, TokenizeOnDelimitersKeepNoSpace) {
  std::vector<Token> tokens = TokenizeOnDelimiters(
      "This   might be čomplíčateď?!: Oder?", {' ', '?', '!'},
      /*create_tokens_for_non_space_delimiters=*/true);

  EXPECT_EQ(tokens.size(), 9);

  EXPECT_EQ(tokens[0].value, "This");
  EXPECT_EQ(tokens[0].start, 0);
  EXPECT_EQ(tokens[0].end, 4);

  EXPECT_EQ(tokens[1].value, "might");
  EXPECT_EQ(tokens[1].start, 7);
  EXPECT_EQ(tokens[1].end, 12);

  EXPECT_EQ(tokens[2].value, "be");
  EXPECT_EQ(tokens[2].start, 13);
  EXPECT_EQ(tokens[2].end, 15);

  EXPECT_EQ(tokens[3].value, "čomplíčateď");
  EXPECT_EQ(tokens[3].start, 16);
  EXPECT_EQ(tokens[3].end, 27);

  EXPECT_EQ(tokens[4].value, "?");
  EXPECT_EQ(tokens[4].start, 27);
  EXPECT_EQ(tokens[4].end, 28);

  EXPECT_EQ(tokens[5].value, "!");
  EXPECT_EQ(tokens[5].start, 28);
  EXPECT_EQ(tokens[5].end, 29);

  EXPECT_EQ(tokens[6].value, ":");
  EXPECT_EQ(tokens[6].start, 29);
  EXPECT_EQ(tokens[6].end, 30);

  EXPECT_EQ(tokens[7].value, "Oder");
  EXPECT_EQ(tokens[7].start, 31);
  EXPECT_EQ(tokens[7].end, 35);

  EXPECT_EQ(tokens[8].value, "?");
  EXPECT_EQ(tokens[8].start, 35);
  EXPECT_EQ(tokens[8].end, 36);
}

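// Punctuation marks (',' and '!') become separate tokens that directly abut
// the preceding word; there is no gap in the offsets.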
TEST(TokenizerUtilTest, SimpleEnglishWithPunctuation) {
  absl::string_view input = "I am fine, thanks!";

  std::vector<Token> tokens =
      TokenizeOnWhiteSpacePunctuationAndChineseLetter(input);

  EXPECT_THAT(tokens, testing::ElementsAreArray(
                          {Token{"I", 0, 1}, Token{"am", 2, 4},
                           Token{"fine", 5, 9}, Token{",", 9, 10},
                           Token{"thanks", 11, 17}, Token{"!", 17, 18}}));
}

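// The final token is still emitted when the input ends mid-word rather than
// at a delimiter.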
TEST(TokenizerUtilTest, InputDoesNotEndWithDelimiter) {
  absl::string_view input = "Good! Cool";

  std::vector<Token> tokens =
      TokenizeOnWhiteSpacePunctuationAndChineseLetter(input);

  EXPECT_THAT(tokens,
              testing::ElementsAreArray({Token{"Good", 0, 4}, Token{"!", 4, 5},
                                         Token{"Cool", 6, 10}}));
}

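// Whitespace-only input (spaces and a tab) produces no tokens at all.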
TEST(TokenizerUtilTest, OnlySpace) {
  absl::string_view input = "  \t";

  std::vector<Token> tokens =
      TokenizeOnWhiteSpacePunctuationAndChineseLetter(input);

  ASSERT_TRUE(tokens.empty());
}

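// Each ASCII punctuation character becomes its own single-character token;
// the input draws characters from the four ASCII punctuation ranges
// (!-/, :-@, [-`, {-~).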
TEST(TokenizerUtilTest, Punctuation) {
  absl::string_view input = "!-/:-@[-`{-~";

  std::vector<Token> tokens =
      TokenizeOnWhiteSpacePunctuationAndChineseLetter(input);

  EXPECT_THAT(tokens,
              testing::ElementsAreArray(
                  {Token{"!", 0, 1}, Token{"-", 1, 2}, Token{"/", 2, 3},
                   Token{":", 3, 4}, Token{"-", 4, 5}, Token{"@", 5, 6},
                   Token{"[", 6, 7}, Token{"-", 7, 8}, Token{"`", 8, 9},
                   Token{"{", 9, 10}, Token{"-", 10, 11}, Token{"~", 11, 12}}));
}

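// Each CJK character is a token of its own, with codepoint-based offsets
// (one position per character, regardless of UTF-8 byte length).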
TEST(TokenizerUtilTest, ChineseCharacters) {
  absl::string_view input = "你好嗎三個字";

  std::vector<Token> tokens =
      TokenizeOnWhiteSpacePunctuationAndChineseLetter(input);

  EXPECT_THAT(tokens,
              testing::ElementsAreArray(
                  {Token{"你", 0, 1}, Token{"好", 1, 2}, Token{"嗎", 2, 3},
                   Token{"三", 3, 4}, Token{"個", 4, 5}, Token{"字", 5, 6}}));
}
}  // namespace
}  // namespace libtextclassifier3