xref: /aosp_15_r20/external/libtextclassifier/native/utils/normalization_test.cc (revision 993b0882672172b81d12fad7a7ac0c3e5c824a12)
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "utils/normalization.h"
18 
19 #include <string>
20 
21 #include "utils/base/integral_types.h"
22 #include "utils/utf8/unicodetext.h"
23 #include "utils/utf8/unilib.h"
24 #include "gmock/gmock.h"
25 #include "gtest/gtest.h"
26 
27 namespace libtextclassifier3 {
28 namespace {
29 
30 using testing::Eq;
31 
32 class NormalizationTest : public testing::Test {
33  protected:
NormalizationTest()34   NormalizationTest() : INIT_UNILIB_FOR_TESTING(unilib_) {}
35 
NormalizeTextCodepointWise(const std::string & text,const int32 codepointwise_ops)36   std::string NormalizeTextCodepointWise(const std::string& text,
37                                          const int32 codepointwise_ops) {
38     return libtextclassifier3::NormalizeTextCodepointWise(
39                unilib_, codepointwise_ops,
40                UTF8ToUnicodeText(text, /*do_copy=*/false))
41         .ToUTF8String();
42   }
43 
44   UniLib unilib_;
45 };
46 
// With the NONE operation the text must come back unchanged.
TEST_F(NormalizationTest, ReturnsIdenticalStringWhenNoNormalization) {
  const std::string input = "Never gonna let you down.";
  const std::string normalized = NormalizeTextCodepointWise(
      input, NormalizationOptions_::CodepointwiseNormalizationOp_NONE);
  EXPECT_THAT(normalized, Eq("Never gonna let you down."));
}
53 
54 #if !defined(TC3_UNILIB_DUMMY)
// DROP_WHITESPACE is expected to remove ASCII spaces, tabs and U+2003
// (EM SPACE) alike; all three inputs collapse to the same string.
TEST_F(NormalizationTest, DropsWhitespace) {
  const int32 drop_whitespace =
      NormalizationOptions_::CodepointwiseNormalizationOp_DROP_WHITESPACE;
  const std::string expected = "Nevergonnaletyoudown.";

  // Plain ASCII spaces.
  EXPECT_THAT(
      NormalizeTextCodepointWise("Never gonna let you down.", drop_whitespace),
      Eq(expected));

  // Tabs, including two consecutive ones.
  EXPECT_THAT(NormalizeTextCodepointWise("Never\tgonna\t\tlet\tyou\tdown.",
                                         drop_whitespace),
              Eq(expected));

  // Non-ASCII whitespace.
  EXPECT_THAT(
      NormalizeTextCodepointWise(
          "Never\u2003gonna\u2003let\u2003you\u2003down.", drop_whitespace),
      Eq(expected));
}
72 
// DROP_PUNCTUATION is expected to strip punctuation while leaving letters,
// digits and whitespace untouched.
TEST_F(NormalizationTest, DropsPunctuation) {
  const int32 drop_punctuation =
      NormalizationOptions_::CodepointwiseNormalizationOp_DROP_PUNCTUATION;

  // ASCII text with a trailing period.
  const std::string ascii_result =
      NormalizeTextCodepointWise("Never gonna let you down.", drop_punctuation);
  EXPECT_THAT(ascii_result, Eq("Never gonna let you down"));

  // Greek text: periods and the comma go away, the rest is kept.
  const std::string greek_result = NormalizeTextCodepointWise(
      "αʹ. Σημεῖόν ἐστιν, οὗ μέρος οὐθέν.", drop_punctuation);
  EXPECT_THAT(greek_result, Eq("αʹ Σημεῖόν ἐστιν οὗ μέρος οὐθέν"));

  // Digits separated by non-ASCII dashes.
  const std::string digits_result =
      NormalizeTextCodepointWise("978—3—16—148410—0", drop_punctuation);
  EXPECT_THAT(digits_result, Eq("9783161484100"));
}
90 
// LOWERCASE is expected to lowercase non-ASCII (Greek) letters, both alone
// and when combined with DROP_WHITESPACE.
TEST_F(NormalizationTest, LowercasesUnicodeText) {
  const std::string input = "αʹ. Σημεῖόν ἐστιν, οὗ μέρος οὐθέν.";

  const int32 lowercase_only =
      NormalizationOptions_::CodepointwiseNormalizationOp_LOWERCASE;
  EXPECT_THAT(NormalizeTextCodepointWise(input, lowercase_only),
              Eq("αʹ. σημεῖόν ἐστιν, οὗ μέρος οὐθέν."));

  // Both operations requested in a single bit mask.
  const int32 lowercase_and_drop_whitespace =
      NormalizationOptions_::CodepointwiseNormalizationOp_DROP_WHITESPACE |
      NormalizationOptions_::CodepointwiseNormalizationOp_LOWERCASE;
  EXPECT_THAT(NormalizeTextCodepointWise(input, lowercase_and_drop_whitespace),
              Eq("αʹ.σημεῖόνἐστιν,οὗμέροςοὐθέν."));
}
104 
// UPPERCASE is expected to uppercase non-ASCII (Greek) letters, both alone
// and when combined with DROP_WHITESPACE.
TEST_F(NormalizationTest, UppercasesUnicodeText) {
  const std::string input = "Κανένας άνθρωπος δεν ξέρει";

  const int32 uppercase_only =
      NormalizationOptions_::CodepointwiseNormalizationOp_UPPERCASE;
  EXPECT_THAT(NormalizeTextCodepointWise(input, uppercase_only),
              Eq("ΚΑΝΈΝΑΣ ΆΝΘΡΩΠΟΣ ΔΕΝ ΞΈΡΕΙ"));

  // Both operations requested in a single bit mask.
  const int32 uppercase_and_drop_whitespace =
      NormalizationOptions_::CodepointwiseNormalizationOp_DROP_WHITESPACE |
      NormalizationOptions_::CodepointwiseNormalizationOp_UPPERCASE;
  EXPECT_THAT(NormalizeTextCodepointWise(input, uppercase_and_drop_whitespace),
              Eq("ΚΑΝΈΝΑΣΆΝΘΡΩΠΟΣΔΕΝΞΈΡΕΙ"));
}
118 #endif
119 
120 }  // namespace
121 }  // namespace libtextclassifier3
122