1*993b0882SAndroid Build Coastguard Worker /*
2*993b0882SAndroid Build Coastguard Worker * Copyright (C) 2018 The Android Open Source Project
3*993b0882SAndroid Build Coastguard Worker *
4*993b0882SAndroid Build Coastguard Worker * Licensed under the Apache License, Version 2.0 (the "License");
5*993b0882SAndroid Build Coastguard Worker * you may not use this file except in compliance with the License.
6*993b0882SAndroid Build Coastguard Worker * You may obtain a copy of the License at
7*993b0882SAndroid Build Coastguard Worker *
8*993b0882SAndroid Build Coastguard Worker * http://www.apache.org/licenses/LICENSE-2.0
9*993b0882SAndroid Build Coastguard Worker *
10*993b0882SAndroid Build Coastguard Worker * Unless required by applicable law or agreed to in writing, software
11*993b0882SAndroid Build Coastguard Worker * distributed under the License is distributed on an "AS IS" BASIS,
12*993b0882SAndroid Build Coastguard Worker * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13*993b0882SAndroid Build Coastguard Worker * See the License for the specific language governing permissions and
14*993b0882SAndroid Build Coastguard Worker * limitations under the License.
15*993b0882SAndroid Build Coastguard Worker */
16*993b0882SAndroid Build Coastguard Worker
17*993b0882SAndroid Build Coastguard Worker #include "utils/tokenizer-utils.h"
18*993b0882SAndroid Build Coastguard Worker
19*993b0882SAndroid Build Coastguard Worker #include "gmock/gmock.h"
20*993b0882SAndroid Build Coastguard Worker #include "gtest/gtest.h"
21*993b0882SAndroid Build Coastguard Worker
22*993b0882SAndroid Build Coastguard Worker namespace libtextclassifier3 {
23*993b0882SAndroid Build Coastguard Worker namespace {
24*993b0882SAndroid Build Coastguard Worker
// Checks that TokenizeOnSpace() splits the sentence at every space and reports
// the expected [start, end) span for each token. The expected spans count
// characters rather than UTF-8 bytes: "Jörg" spans 9..13 and "Zürich" spans
// 37..43 even though 'ö' and 'ü' are multi-byte in UTF-8.
TEST(TokenizerUtilTest, TokenizeOnSpace) {
  const std::vector<Token> tokens =
      TokenizeOnSpace("Where is Jörg Borg located? Maybe in Zürich ...");

  // ASSERT (not EXPECT): the element accesses below would read out of bounds
  // if the tokenizer returned fewer tokens than expected.
  ASSERT_EQ(tokens.size(), 9);

  EXPECT_EQ(tokens[0].value, "Where");
  EXPECT_EQ(tokens[0].start, 0);
  EXPECT_EQ(tokens[0].end, 5);

  EXPECT_EQ(tokens[1].value, "is");
  EXPECT_EQ(tokens[1].start, 6);
  EXPECT_EQ(tokens[1].end, 8);

  EXPECT_EQ(tokens[2].value, "Jörg");
  EXPECT_EQ(tokens[2].start, 9);
  EXPECT_EQ(tokens[2].end, 13);

  EXPECT_EQ(tokens[3].value, "Borg");
  EXPECT_EQ(tokens[3].start, 14);
  EXPECT_EQ(tokens[3].end, 18);

  // Punctuation attached to a word stays part of that token.
  EXPECT_EQ(tokens[4].value, "located?");
  EXPECT_EQ(tokens[4].start, 19);
  EXPECT_EQ(tokens[4].end, 27);

  EXPECT_EQ(tokens[5].value, "Maybe");
  EXPECT_EQ(tokens[5].start, 28);
  EXPECT_EQ(tokens[5].end, 33);

  EXPECT_EQ(tokens[6].value, "in");
  EXPECT_EQ(tokens[6].start, 34);
  EXPECT_EQ(tokens[6].end, 36);

  EXPECT_EQ(tokens[7].value, "Zürich");
  EXPECT_EQ(tokens[7].start, 37);
  EXPECT_EQ(tokens[7].end, 43);

  EXPECT_EQ(tokens[8].value, "...");
  EXPECT_EQ(tokens[8].start, 44);
  EXPECT_EQ(tokens[8].end, 47);
}
67*993b0882SAndroid Build Coastguard Worker
// Checks TokenizeOnDelimiters() with the delimiter set {' ', '?', '!'} and no
// third argument. In the expected output none of the delimiters appear as
// tokens, while ':' (not in the delimiter set) is a token of its own.
TEST(TokenizerUtilTest, TokenizeOnDelimiters) {
  // Three spaces follow "This" so that "might" begins at offset 7, matching
  // the spans asserted below; consecutive delimiters yield no empty tokens.
  const std::vector<Token> tokens = TokenizeOnDelimiters(
      "This   might be čomplíčateď?!: Oder?", {' ', '?', '!'});

  // ASSERT (not EXPECT): the element accesses below would read out of bounds
  // if the tokenizer returned fewer tokens than expected.
  ASSERT_EQ(tokens.size(), 6);

  EXPECT_EQ(tokens[0].value, "This");
  EXPECT_EQ(tokens[0].start, 0);
  EXPECT_EQ(tokens[0].end, 4);

  EXPECT_EQ(tokens[1].value, "might");
  EXPECT_EQ(tokens[1].start, 7);
  EXPECT_EQ(tokens[1].end, 12);

  EXPECT_EQ(tokens[2].value, "be");
  EXPECT_EQ(tokens[2].start, 13);
  EXPECT_EQ(tokens[2].end, 15);

  // Eleven characters, several of them multi-byte in UTF-8: span is 16..27.
  EXPECT_EQ(tokens[3].value, "čomplíčateď");
  EXPECT_EQ(tokens[3].start, 16);
  EXPECT_EQ(tokens[3].end, 27);

  // '?' (27) and '!' (28) are skipped; ':' survives as a token.
  EXPECT_EQ(tokens[4].value, ":");
  EXPECT_EQ(tokens[4].start, 29);
  EXPECT_EQ(tokens[4].end, 30);

  EXPECT_EQ(tokens[5].value, "Oder");
  EXPECT_EQ(tokens[5].start, 31);
  EXPECT_EQ(tokens[5].end, 35);
}
98*993b0882SAndroid Build Coastguard Worker
// Checks TokenizeOnDelimiters() with create_tokens_for_non_space_delimiters
// set to true: the expected output contains a one-character token for each
// '?' and '!' delimiter, but no token for any space.
TEST(TokenizerUtilTest, TokenizeOnDelimitersKeepNoSpace) {
  // Three spaces follow "This" so that "might" begins at offset 7, matching
  // the spans asserted below; the spaces themselves never become tokens.
  const std::vector<Token> tokens = TokenizeOnDelimiters(
      "This   might be čomplíčateď?!: Oder?", {' ', '?', '!'},
      /* create_tokens_for_non_space_delimiters =*/true);

  // ASSERT (not EXPECT): the element accesses below would read out of bounds
  // if the tokenizer returned fewer tokens than expected.
  ASSERT_EQ(tokens.size(), 9);

  EXPECT_EQ(tokens[0].value, "This");
  EXPECT_EQ(tokens[0].start, 0);
  EXPECT_EQ(tokens[0].end, 4);

  EXPECT_EQ(tokens[1].value, "might");
  EXPECT_EQ(tokens[1].start, 7);
  EXPECT_EQ(tokens[1].end, 12);

  EXPECT_EQ(tokens[2].value, "be");
  EXPECT_EQ(tokens[2].start, 13);
  EXPECT_EQ(tokens[2].end, 15);

  EXPECT_EQ(tokens[3].value, "čomplíčateď");
  EXPECT_EQ(tokens[3].start, 16);
  EXPECT_EQ(tokens[3].end, 27);

  // Non-space delimiters are emitted as their own tokens.
  EXPECT_EQ(tokens[4].value, "?");
  EXPECT_EQ(tokens[4].start, 27);
  EXPECT_EQ(tokens[4].end, 28);

  EXPECT_EQ(tokens[5].value, "!");
  EXPECT_EQ(tokens[5].start, 28);
  EXPECT_EQ(tokens[5].end, 29);

  // ':' is not a delimiter; it is the text between '!' and the next space.
  EXPECT_EQ(tokens[6].value, ":");
  EXPECT_EQ(tokens[6].start, 29);
  EXPECT_EQ(tokens[6].end, 30);

  EXPECT_EQ(tokens[7].value, "Oder");
  EXPECT_EQ(tokens[7].start, 31);
  EXPECT_EQ(tokens[7].end, 35);

  EXPECT_EQ(tokens[8].value, "?");
  EXPECT_EQ(tokens[8].start, 35);
  EXPECT_EQ(tokens[8].end, 36);
}
142*993b0882SAndroid Build Coastguard Worker
// A plain English sentence: words are separated at spaces, and the comma and
// exclamation mark are expected as separate one-character tokens.
TEST(TokenizerUtilTest, SimpleEnglishWithPunctuation) {
  const absl::string_view text = "I am fine, thanks!";
  const std::vector<Token> expected = {
      Token{"I", 0, 1},     Token{"am", 2, 4},       Token{"fine", 5, 9},
      Token{",", 9, 10},    Token{"thanks", 11, 17}, Token{"!", 17, 18}};

  const std::vector<Token> actual =
      TokenizeOnWhiteSpacePunctuationAndChineseLetter(text);

  EXPECT_THAT(actual, testing::ElementsAreArray(expected));
}
154*993b0882SAndroid Build Coastguard Worker
// The trailing word must still be emitted when the input ends without a
// delimiter after it.
TEST(TokenizerUtilTest, InputDoesNotEndWithDelimiter) {
  const absl::string_view text = "Good! Cool";
  const std::vector<Token> expected = {Token{"Good", 0, 4}, Token{"!", 4, 5},
                                       Token{"Cool", 6, 10}};

  const std::vector<Token> actual =
      TokenizeOnWhiteSpacePunctuationAndChineseLetter(text);

  EXPECT_THAT(actual, testing::ElementsAreArray(expected));
}
165*993b0882SAndroid Build Coastguard Worker
// Input made up solely of whitespace (space and tab) must yield no tokens.
TEST(TokenizerUtilTest, OnlySpace) {
  const absl::string_view whitespace_only = " \t";

  const std::vector<Token> actual =
      TokenizeOnWhiteSpacePunctuationAndChineseLetter(whitespace_only);

  ASSERT_TRUE(actual.empty());
}
174*993b0882SAndroid Build Coastguard Worker
// Twelve consecutive ASCII punctuation characters: each one is expected as
// its own single-character token, with consecutive offsets 0..12.
TEST(TokenizerUtilTest, Punctuation) {
  const absl::string_view text = "!-/:-@[-`{-~";
  const std::vector<Token> expected = {
      Token{"!", 0, 1},  Token{"-", 1, 2},   Token{"/", 2, 3},
      Token{":", 3, 4},  Token{"-", 4, 5},   Token{"@", 5, 6},
      Token{"[", 6, 7},  Token{"-", 7, 8},   Token{"`", 8, 9},
      Token{"{", 9, 10}, Token{"-", 10, 11}, Token{"~", 11, 12}};

  const std::vector<Token> actual =
      TokenizeOnWhiteSpacePunctuationAndChineseLetter(text);

  EXPECT_THAT(actual, testing::ElementsAreArray(expected));
}
188*993b0882SAndroid Build Coastguard Worker
// Six Chinese characters with no separators: each character is expected as a
// separate token, and the offsets advance by one per character.
TEST(TokenizerUtilTest, ChineseCharacters) {
  const absl::string_view text = "你好嗎三個字";
  const std::vector<Token> expected = {
      Token{"你", 0, 1}, Token{"好", 1, 2}, Token{"嗎", 2, 3},
      Token{"三", 3, 4}, Token{"個", 4, 5}, Token{"字", 5, 6}};

  const std::vector<Token> actual =
      TokenizeOnWhiteSpacePunctuationAndChineseLetter(text);

  EXPECT_THAT(actual, testing::ElementsAreArray(expected));
}
200*993b0882SAndroid Build Coastguard Worker } // namespace
201*993b0882SAndroid Build Coastguard Worker } // namespace libtextclassifier3
202