1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "utils/normalization.h"
18
19 #include <string>
20
21 #include "utils/base/integral_types.h"
22 #include "utils/utf8/unicodetext.h"
23 #include "utils/utf8/unilib.h"
24 #include "gmock/gmock.h"
25 #include "gtest/gtest.h"
26
27 namespace libtextclassifier3 {
28 namespace {
29
30 using testing::Eq;
31
32 class NormalizationTest : public testing::Test {
33 protected:
NormalizationTest()34 NormalizationTest() : INIT_UNILIB_FOR_TESTING(unilib_) {}
35
NormalizeTextCodepointWise(const std::string & text,const int32 codepointwise_ops)36 std::string NormalizeTextCodepointWise(const std::string& text,
37 const int32 codepointwise_ops) {
38 return libtextclassifier3::NormalizeTextCodepointWise(
39 unilib_, codepointwise_ops,
40 UTF8ToUnicodeText(text, /*do_copy=*/false))
41 .ToUTF8String();
42 }
43
44 UniLib unilib_;
45 };
46
// With the NONE op the text is expected to come back unchanged.
TEST_F(NormalizationTest, ReturnsIdenticalStringWhenNoNormalization) {
  const int32 no_ops = NormalizationOptions_::CodepointwiseNormalizationOp_NONE;
  const std::string normalized =
      NormalizeTextCodepointWise("Never gonna let you down.", no_ops);

  EXPECT_THAT(normalized, Eq("Never gonna let you down."));
}
53
54 #if !defined(TC3_UNILIB_DUMMY)
// DROP_WHITESPACE is expected to remove every whitespace codepoint, whichever
// kind of whitespace separates the words.
TEST_F(NormalizationTest, DropsWhitespace) {
  const int32 drop_whitespace =
      NormalizationOptions_::CodepointwiseNormalizationOp_DROP_WHITESPACE;
  // All three inputs below should collapse to this same string.
  const std::string expected = "Nevergonnaletyoudown.";

  // Words separated by single ASCII spaces.
  EXPECT_THAT(
      NormalizeTextCodepointWise("Never gonna let you down.", drop_whitespace),
      Eq(expected));

  // Words separated by tabs, including two consecutive tabs.
  EXPECT_THAT(NormalizeTextCodepointWise("Never\tgonna\t\tlet\tyou\tdown.",
                                         drop_whitespace),
              Eq(expected));

  // Words separated by U+2003, a non-ASCII whitespace codepoint.
  EXPECT_THAT(
      NormalizeTextCodepointWise(
          "Never\u2003gonna\u2003let\u2003you\u2003down.", drop_whitespace),
      Eq(expected));
}
72
// DROP_PUNCTUATION is expected to remove punctuation codepoints while leaving
// letters, digits and whitespace in place.
TEST_F(NormalizationTest, DropsPunctuation) {
  const int32 drop_punctuation =
      NormalizationOptions_::CodepointwiseNormalizationOp_DROP_PUNCTUATION;

  // ASCII sentence: only the trailing full stop disappears.
  const std::string ascii_result =
      NormalizeTextCodepointWise("Never gonna let you down.", drop_punctuation);
  EXPECT_THAT(ascii_result, Eq("Never gonna let you down"));

  // Greek sentence: the full stops and the comma disappear, the rest stays.
  const std::string greek_result = NormalizeTextCodepointWise(
      "αʹ. Σημεῖόν ἐστιν, οὗ μέρος οὐθέν.", drop_punctuation);
  EXPECT_THAT(greek_result, Eq("αʹ Σημεῖόν ἐστιν οὗ μέρος οὐθέν"));

  // Digit groups joined by non-ASCII dashes: only the digits remain.
  const std::string digits_result =
      NormalizeTextCodepointWise("978—3—16—148410—0", drop_punctuation);
  EXPECT_THAT(digits_result, Eq("9783161484100"));
}
90
// LOWERCASE is expected to lowercase non-ASCII letters, both on its own and
// when OR-ed together with DROP_WHITESPACE.
TEST_F(NormalizationTest, LowercasesUnicodeText) {
  // The capital sigma is the only character the lowercase expectation changes.
  const std::string greek_text = "αʹ. Σημεῖόν ἐστιν, οὗ μέρος οὐθέν.";

  const int32 lowercase_only =
      NormalizationOptions_::CodepointwiseNormalizationOp_LOWERCASE;
  EXPECT_THAT(NormalizeTextCodepointWise(greek_text, lowercase_only),
              Eq("αʹ. σημεῖόν ἐστιν, οὗ μέρος οὐθέν."));

  // Both ops combined: lowercased and with the spaces removed.
  const int32 lowercase_and_drop_whitespace =
      NormalizationOptions_::CodepointwiseNormalizationOp_DROP_WHITESPACE |
      NormalizationOptions_::CodepointwiseNormalizationOp_LOWERCASE;
  EXPECT_THAT(
      NormalizeTextCodepointWise(greek_text, lowercase_and_drop_whitespace),
      Eq("αʹ.σημεῖόνἐστιν,οὗμέροςοὐθέν."));
}
104
// UPPERCASE is expected to uppercase non-ASCII letters, both on its own and
// when OR-ed together with DROP_WHITESPACE.
TEST_F(NormalizationTest, UppercasesUnicodeText) {
  const std::string greek_text = "Κανένας άνθρωπος δεν ξέρει";

  const int32 uppercase_only =
      NormalizationOptions_::CodepointwiseNormalizationOp_UPPERCASE;
  EXPECT_THAT(NormalizeTextCodepointWise(greek_text, uppercase_only),
              Eq("ΚΑΝΈΝΑΣ ΆΝΘΡΩΠΟΣ ΔΕΝ ΞΈΡΕΙ"));

  // Both ops combined: uppercased and with the spaces removed.
  const int32 uppercase_and_drop_whitespace =
      NormalizationOptions_::CodepointwiseNormalizationOp_DROP_WHITESPACE |
      NormalizationOptions_::CodepointwiseNormalizationOp_UPPERCASE;
  EXPECT_THAT(
      NormalizeTextCodepointWise(greek_text, uppercase_and_drop_whitespace),
      Eq("ΚΑΝΈΝΑΣΆΝΘΡΩΠΟΣΔΕΝΞΈΡΕΙ"));
}
118 #endif
119
120 } // namespace
121 } // namespace libtextclassifier3
122