/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "utils/tokenizer-utils.h"

#include "gmock/gmock.h"
#include "gtest/gtest.h"

namespace libtextclassifier3 {
namespace {

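// Token offsets are in Unicode codepoints rather than bytes: "Jörg" below
// spans [9, 13) even though "ö" encodes to two bytes in UTF-8.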
TEST(TokenizerUtilTest, TokenizeOnSpace) {
  std::vector<Token> tokens =
      TokenizeOnSpace("Where is Jörg Borg located? Maybe in Zürich ...");

  EXPECT_EQ(tokens.size(), 9);

  EXPECT_EQ(tokens[0].value, "Where");
  EXPECT_EQ(tokens[0].start, 0);
  EXPECT_EQ(tokens[0].end, 5);

  EXPECT_EQ(tokens[1].value, "is");
  EXPECT_EQ(tokens[1].start, 6);
  EXPECT_EQ(tokens[1].end, 8);

  EXPECT_EQ(tokens[2].value, "Jörg");
  EXPECT_EQ(tokens[2].start, 9);
  EXPECT_EQ(tokens[2].end, 13);

  EXPECT_EQ(tokens[3].value, "Borg");
  EXPECT_EQ(tokens[3].start, 14);
  EXPECT_EQ(tokens[3].end, 18);

  EXPECT_EQ(tokens[4].value, "located?");
  EXPECT_EQ(tokens[4].start, 19);
  EXPECT_EQ(tokens[4].end, 27);

  EXPECT_EQ(tokens[5].value, "Maybe");
  EXPECT_EQ(tokens[5].start, 28);
  EXPECT_EQ(tokens[5].end, 33);

  EXPECT_EQ(tokens[6].value, "in");
  EXPECT_EQ(tokens[6].start, 34);
  EXPECT_EQ(tokens[6].end, 36);

  EXPECT_EQ(tokens[7].value, "Zürich");
  EXPECT_EQ(tokens[7].start, 37);
  EXPECT_EQ(tokens[7].end, 43);

  EXPECT_EQ(tokens[8].value, "...");
  EXPECT_EQ(tokens[8].start, 44);
  EXPECT_EQ(tokens[8].end, 47);
}

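// Splitting on a custom delimiter set: consecutive delimiters (the run of
// spaces after "This", the "?!" before ":") yield no empty tokens, and the
// delimiters themselves are not part of any token.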
TEST(TokenizerUtilTest, TokenizeOnDelimiters) {
  std::vector<Token> tokens = TokenizeOnDelimiters(
      "This   might be čomplíčateď?!: Oder?", {' ', '?', '!'});

  EXPECT_EQ(tokens.size(), 6);

  EXPECT_EQ(tokens[0].value, "This");
  EXPECT_EQ(tokens[0].start, 0);
  EXPECT_EQ(tokens[0].end, 4);

  EXPECT_EQ(tokens[1].value, "might");
  EXPECT_EQ(tokens[1].start, 7);
  EXPECT_EQ(tokens[1].end, 12);

  EXPECT_EQ(tokens[2].value, "be");
  EXPECT_EQ(tokens[2].start, 13);
  EXPECT_EQ(tokens[2].end, 15);

  EXPECT_EQ(tokens[3].value, "čomplíčateď");
  EXPECT_EQ(tokens[3].start, 16);
  EXPECT_EQ(tokens[3].end, 27);

  EXPECT_EQ(tokens[4].value, ":");
  EXPECT_EQ(tokens[4].start, 29);
  EXPECT_EQ(tokens[4].end, 30);

  EXPECT_EQ(tokens[5].value, "Oder");
  EXPECT_EQ(tokens[5].start, 31);
  EXPECT_EQ(tokens[5].end, 35);
}

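// Same input as above, but with create_tokens_for_non_space_delimiters set,
// each non-space delimiter ('?', '!') becomes its own single-codepoint token;
// spaces are still dropped.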
TEST(TokenizerUtilTest, TokenizeOnDelimitersKeepNoSpace) {
  std::vector<Token> tokens = TokenizeOnDelimiters(
      "This   might be čomplíčateď?!: Oder?", {' ', '?', '!'},
      /*create_tokens_for_non_space_delimiters=*/true);

  EXPECT_EQ(tokens.size(), 9);

  EXPECT_EQ(tokens[0].value, "This");
  EXPECT_EQ(tokens[0].start, 0);
  EXPECT_EQ(tokens[0].end, 4);

  EXPECT_EQ(tokens[1].value, "might");
  EXPECT_EQ(tokens[1].start, 7);
  EXPECT_EQ(tokens[1].end, 12);

  EXPECT_EQ(tokens[2].value, "be");
  EXPECT_EQ(tokens[2].start, 13);
  EXPECT_EQ(tokens[2].end, 15);

  EXPECT_EQ(tokens[3].value, "čomplíčateď");
  EXPECT_EQ(tokens[3].start, 16);
  EXPECT_EQ(tokens[3].end, 27);

  EXPECT_EQ(tokens[4].value, "?");
  EXPECT_EQ(tokens[4].start, 27);
  EXPECT_EQ(tokens[4].end, 28);

  EXPECT_EQ(tokens[5].value, "!");
  EXPECT_EQ(tokens[5].start, 28);
  EXPECT_EQ(tokens[5].end, 29);

  EXPECT_EQ(tokens[6].value, ":");
  EXPECT_EQ(tokens[6].start, 29);
  EXPECT_EQ(tokens[6].end, 30);

  EXPECT_EQ(tokens[7].value, "Oder");
  EXPECT_EQ(tokens[7].start, 31);
  EXPECT_EQ(tokens[7].end, 35);

  EXPECT_EQ(tokens[8].value, "?");
  EXPECT_EQ(tokens[8].start, 35);
  EXPECT_EQ(tokens[8].end, 36);
}

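// Punctuation attached to a word ("fine," and "thanks!") is split off into a
// separate token.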
TEST(TokenizerUtilTest, SimpleEnglishWithPunctuation) {
  absl::string_view input = "I am fine, thanks!";

  std::vector<Token> tokens =
      TokenizeOnWhiteSpacePunctuationAndChineseLetter(input);

  EXPECT_THAT(tokens, testing::ElementsAreArray(
                          {Token{"I", 0, 1}, Token{"am", 2, 4},
                           Token{"fine", 5, 9}, Token{",", 9, 10},
                           Token{"thanks", 11, 17}, Token{"!", 17, 18}}));
}

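// The final token is still emitted when the input does not end with a
// delimiter.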
TEST(TokenizerUtilTest, InputDoesNotEndWithDelimiter) {
  absl::string_view input = "Good! Cool";

  std::vector<Token> tokens =
      TokenizeOnWhiteSpacePunctuationAndChineseLetter(input);

  EXPECT_THAT(tokens,
              testing::ElementsAreArray({Token{"Good", 0, 4}, Token{"!", 4, 5},
                                         Token{"Cool", 6, 10}}));
}

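// Input consisting only of whitespace yields no tokens.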
TEST(TokenizerUtilTest, OnlySpace) {
  absl::string_view input = "  \t";

  std::vector<Token> tokens =
      TokenizeOnWhiteSpacePunctuationAndChineseLetter(input);

  ASSERT_TRUE(tokens.empty());
}

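// Every ASCII punctuation character becomes its own single-character token.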
TEST(TokenizerUtilTest, Punctuation) {
  absl::string_view input = "!-/:-@[-`{-~";

  std::vector<Token> tokens =
      TokenizeOnWhiteSpacePunctuationAndChineseLetter(input);

  EXPECT_THAT(tokens,
              testing::ElementsAreArray(
                  {Token{"!", 0, 1}, Token{"-", 1, 2}, Token{"/", 2, 3},
                   Token{":", 3, 4}, Token{"-", 4, 5}, Token{"@", 5, 6},
                   Token{"[", 6, 7}, Token{"-", 7, 8}, Token{"`", 8, 9},
                   Token{"{", 9, 10}, Token{"-", 10, 11}, Token{"~", 11, 12}}));
}

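// Chinese text has no space delimiters; the tokenizer emits one token per
// character, with codepoint offsets.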
TEST(TokenizerUtilTest, ChineseCharacters) {
  absl::string_view input = "你好嗎三個字";

  std::vector<Token> tokens =
      TokenizeOnWhiteSpacePunctuationAndChineseLetter(input);

  EXPECT_THAT(tokens,
              testing::ElementsAreArray(
                  {Token{"你", 0, 1}, Token{"好", 1, 2}, Token{"嗎", 2, 3},
                   Token{"三", 3, 4}, Token{"個", 4, 5}, Token{"字", 5, 6}}));
}
}  // namespace
}  // namespace libtextclassifier3