xref: /aosp_15_r20/external/libtextclassifier/native/utils/tokenizer-utils_test.cc (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16*993b0882SAndroid Build Coastguard Worker 
17*993b0882SAndroid Build Coastguard Worker #include "utils/tokenizer-utils.h"
18*993b0882SAndroid Build Coastguard Worker 
19*993b0882SAndroid Build Coastguard Worker #include "gmock/gmock.h"
20*993b0882SAndroid Build Coastguard Worker #include "gtest/gtest.h"
21*993b0882SAndroid Build Coastguard Worker 
22*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
23*993b0882SAndroid Build Coastguard Worker namespace {
24*993b0882SAndroid Build Coastguard Worker 
TEST(TokenizerUtilTest,TokenizeOnSpace)25*993b0882SAndroid Build Coastguard Worker TEST(TokenizerUtilTest, TokenizeOnSpace) {
26*993b0882SAndroid Build Coastguard Worker   std::vector<Token> tokens =
27*993b0882SAndroid Build Coastguard Worker       TokenizeOnSpace("Where is Jörg Borg located? Maybe in Zürich ...");
28*993b0882SAndroid Build Coastguard Worker 
29*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens.size(), 9);
30*993b0882SAndroid Build Coastguard Worker 
31*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[0].value, "Where");
32*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[0].start, 0);
33*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[0].end, 5);
34*993b0882SAndroid Build Coastguard Worker 
35*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[1].value, "is");
36*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[1].start, 6);
37*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[1].end, 8);
38*993b0882SAndroid Build Coastguard Worker 
39*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[2].value, "Jörg");
40*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[2].start, 9);
41*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[2].end, 13);
42*993b0882SAndroid Build Coastguard Worker 
43*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[3].value, "Borg");
44*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[3].start, 14);
45*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[3].end, 18);
46*993b0882SAndroid Build Coastguard Worker 
47*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[4].value, "located?");
48*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[4].start, 19);
49*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[4].end, 27);
50*993b0882SAndroid Build Coastguard Worker 
51*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[5].value, "Maybe");
52*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[5].start, 28);
53*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[5].end, 33);
54*993b0882SAndroid Build Coastguard Worker 
55*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[6].value, "in");
56*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[6].start, 34);
57*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[6].end, 36);
58*993b0882SAndroid Build Coastguard Worker 
59*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[7].value, "Zürich");
60*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[7].start, 37);
61*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[7].end, 43);
62*993b0882SAndroid Build Coastguard Worker 
63*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[8].value, "...");
64*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[8].start, 44);
65*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[8].end, 47);
66*993b0882SAndroid Build Coastguard Worker }
67*993b0882SAndroid Build Coastguard Worker 
// Checks TokenizeOnDelimiters with the delimiter set {' ', '?', '!'} and the
// default handling of delimiters.
//
// Expectations encoded below:
//  - the run of three spaces after "This" yields no empty tokens,
//  - the delimiters '?' and '!' do not appear as tokens,
//  - ':' is not in the delimiter set and therefore comes back as a token.
TEST(TokenizerUtilTest, TokenizeOnDelimiters) {
  std::vector<Token> tokens = TokenizeOnDelimiters(
      "This   might be čomplíčateď?!: Oder?", {' ', '?', '!'});

  // Fatal assertion: the checks below index tokens[0..5], which would read
  // out of bounds if the tokenizer returned fewer tokens than expected.
  ASSERT_EQ(tokens.size(), 6);

  EXPECT_EQ(tokens[0].value, "This");
  EXPECT_EQ(tokens[0].start, 0);
  EXPECT_EQ(tokens[0].end, 4);

  EXPECT_EQ(tokens[1].value, "might");
  EXPECT_EQ(tokens[1].start, 7);
  EXPECT_EQ(tokens[1].end, 12);

  EXPECT_EQ(tokens[2].value, "be");
  EXPECT_EQ(tokens[2].start, 13);
  EXPECT_EQ(tokens[2].end, 15);

  // Eleven characters wide even though several of them are multi-byte.
  EXPECT_EQ(tokens[3].value, "čomplíčateď");
  EXPECT_EQ(tokens[3].start, 16);
  EXPECT_EQ(tokens[3].end, 27);

  EXPECT_EQ(tokens[4].value, ":");
  EXPECT_EQ(tokens[4].start, 29);
  EXPECT_EQ(tokens[4].end, 30);

  EXPECT_EQ(tokens[5].value, "Oder");
  EXPECT_EQ(tokens[5].start, 31);
  EXPECT_EQ(tokens[5].end, 35);
}
98*993b0882SAndroid Build Coastguard Worker 
TEST(TokenizerUtilTest,TokenizeOnDelimitersKeepNoSpace)99*993b0882SAndroid Build Coastguard Worker TEST(TokenizerUtilTest, TokenizeOnDelimitersKeepNoSpace) {
100*993b0882SAndroid Build Coastguard Worker   std::vector<Token> tokens = TokenizeOnDelimiters(
101*993b0882SAndroid Build Coastguard Worker       "This   might be čomplíčateď?!: Oder?", {' ', '?', '!'},
102*993b0882SAndroid Build Coastguard Worker       /* create_tokens_for_non_space_delimiters =*/true);
103*993b0882SAndroid Build Coastguard Worker 
104*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens.size(), 9);
105*993b0882SAndroid Build Coastguard Worker 
106*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[0].value, "This");
107*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[0].start, 0);
108*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[0].end, 4);
109*993b0882SAndroid Build Coastguard Worker 
110*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[1].value, "might");
111*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[1].start, 7);
112*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[1].end, 12);
113*993b0882SAndroid Build Coastguard Worker 
114*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[2].value, "be");
115*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[2].start, 13);
116*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[2].end, 15);
117*993b0882SAndroid Build Coastguard Worker 
118*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[3].value, "čomplíčateď");
119*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[3].start, 16);
120*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[3].end, 27);
121*993b0882SAndroid Build Coastguard Worker 
122*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[4].value, "?");
123*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[4].start, 27);
124*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[4].end, 28);
125*993b0882SAndroid Build Coastguard Worker 
126*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[5].value, "!");
127*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[5].start, 28);
128*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[5].end, 29);
129*993b0882SAndroid Build Coastguard Worker 
130*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[6].value, ":");
131*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[6].start, 29);
132*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[6].end, 30);
133*993b0882SAndroid Build Coastguard Worker 
134*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[7].value, "Oder");
135*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[7].start, 31);
136*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[7].end, 35);
137*993b0882SAndroid Build Coastguard Worker 
138*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[8].value, "?");
139*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[8].start, 35);
140*993b0882SAndroid Build Coastguard Worker   EXPECT_EQ(tokens[8].end, 36);
141*993b0882SAndroid Build Coastguard Worker }
142*993b0882SAndroid Build Coastguard Worker 
// A plain English sentence: words are split on spaces, and the comma and the
// exclamation mark are each expected as separate one-character tokens.
TEST(TokenizerUtilTest, SimpleEnglishWithPunctuation) {
  const absl::string_view sentence = "I am fine, thanks!";
  const std::vector<Token> expected = {
      Token{"I", 0, 1},       Token{"am", 2, 4},
      Token{"fine", 5, 9},    Token{",", 9, 10},
      Token{"thanks", 11, 17}, Token{"!", 17, 18},
  };

  const std::vector<Token> actual =
      TokenizeOnWhiteSpacePunctuationAndChineseLetter(sentence);

  EXPECT_THAT(actual, testing::ElementsAreArray(expected));
}
154*993b0882SAndroid Build Coastguard Worker 
// The input ends in a regular word rather than a delimiter; the final word
// "Cool" is still expected as the last token.
TEST(TokenizerUtilTest, InputDoesNotEndWithDelimiter) {
  const absl::string_view text = "Good! Cool";
  const std::vector<Token> expected = {
      Token{"Good", 0, 4},
      Token{"!", 4, 5},
      Token{"Cool", 6, 10},
  };

  const std::vector<Token> actual =
      TokenizeOnWhiteSpacePunctuationAndChineseLetter(text);

  EXPECT_THAT(actual, testing::ElementsAreArray(expected));
}
165*993b0882SAndroid Build Coastguard Worker 
// Input made of two spaces and a tab only: no tokens are expected at all.
TEST(TokenizerUtilTest, OnlySpace) {
  const absl::string_view whitespace_only = "  \t";

  const std::vector<Token> result =
      TokenizeOnWhiteSpacePunctuationAndChineseLetter(whitespace_only);

  ASSERT_TRUE(result.empty());
}
174*993b0882SAndroid Build Coastguard Worker 
// Twelve ASCII punctuation characters with nothing between them: each one is
// expected as its own one-character token, in input order.
TEST(TokenizerUtilTest, Punctuation) {
  const absl::string_view punctuation = "!-/:-@[-`{-~";
  const std::vector<Token> expected = {
      Token{"!", 0, 1},  Token{"-", 1, 2},   Token{"/", 2, 3},
      Token{":", 3, 4},  Token{"-", 4, 5},   Token{"@", 5, 6},
      Token{"[", 6, 7},  Token{"-", 7, 8},   Token{"`", 8, 9},
      Token{"{", 9, 10}, Token{"-", 10, 11}, Token{"~", 11, 12},
  };

  const std::vector<Token> actual =
      TokenizeOnWhiteSpacePunctuationAndChineseLetter(punctuation);

  EXPECT_THAT(actual, testing::ElementsAreArray(expected));
}
188*993b0882SAndroid Build Coastguard Worker 
// Six Chinese characters with no separators: each character is expected as
// its own token, and each expected span is exactly one unit wide.
TEST(TokenizerUtilTest, ChineseCharacters) {
  const absl::string_view chinese_text = "你好嗎三個字";
  const std::vector<Token> expected = {
      Token{"你", 0, 1}, Token{"好", 1, 2}, Token{"嗎", 2, 3},
      Token{"三", 3, 4}, Token{"個", 4, 5}, Token{"字", 5, 6},
  };

  const std::vector<Token> actual =
      TokenizeOnWhiteSpacePunctuationAndChineseLetter(chinese_text);

  EXPECT_THAT(actual, testing::ElementsAreArray(expected));
}
200*993b0882SAndroid Build Coastguard Worker }  // namespace
201*993b0882SAndroid Build Coastguard Worker }  // namespace libtextclassifier3
202