1 /*
2 * Copyright (C) 2024 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <android-base/file.h>
18 #include <gtest/gtest.h>
19
20 #include <memory>
21 #include <span>
22 #include <string>
23 #include <vector>
24
25 // Goes first due to conflicts.
26 #include "cpp/fpdf_scopers.h"
27 #include "document.h"
28 #include "fpdfview.h"
29 #include "page.h"
30 #include "rect.h"
31
32 namespace {
33
34 using ::pdfClient::Document;
35 using ::pdfClient::Page;
36 using ::pdfClient::Rectangle_i;
37
// Name of the fixture directory (joined onto the test data directory by
// GetTestFile) followed by the PDF fixture files used by the tests in this file.
// Constants in an anonymous namespace already have internal linkage.
const std::string kTestdata{"testdata"};
const std::string kChineseFile{"chinese.pdf"};
const std::string kFrenchFile{"french.pdf"};
const std::string kSamplePdfFile{"sample_pdf.pdf"};
const std::string kSekretNoPassword{"sekret_no_password.pdf"};
43
GetTestDataDir()44 std::string GetTestDataDir() {
45 return android::base::GetExecutableDirectory();
46 }
47
GetTestFile(std::string filename)48 std::string GetTestFile(std::string filename) {
49 return GetTestDataDir() + "/" + kTestdata + "/" + filename;
50 }
51
LoadTestDocument(const std::string filename)52 ScopedFPDFDocument LoadTestDocument(const std::string filename) {
53 return ScopedFPDFDocument(FPDF_LoadDocument(GetTestFile(filename).c_str(), nullptr));
54 }
55
Area(const Rectangle_i & rect)56 int Area(const Rectangle_i& rect) {
57 return rect.Width() * rect.Height();
58 }
59
NumRectsForMatch(std::span<const Rectangle_i> rects,std::span<const int> match_to_rect,int match)60 int NumRectsForMatch(std::span<const Rectangle_i> rects, std::span<const int> match_to_rect,
61 int match) {
62 if (match < 0 || match >= match_to_rect.size()) {
63 return 0;
64 }
65 if (match + 1 == match_to_rect.size()) {
66 return rects.size() - match_to_rect[match];
67 }
68 return match_to_rect[match + 1] - match_to_rect[match];
69 }
70
// Checks that FindMatchesUtf8 finds a French word regardless of letter case and
// accents, while a plain std::string::find on the page text only finds the
// exact spelling.
TEST(Test, SearchPageText_french) {
    Document doc(LoadTestDocument(kFrenchFile), false);
    std::shared_ptr<Page> page = doc.GetPage(0);

    const std::string expected_word = "généralement";
    const std::string wrong_case = "GÉNérALEment";
    const std::string missing_accents = "GENerALEment";
    const std::string unexpected_word = "discothèque";

    std::string page_text = page->GetTextUtf8();

    // We can find exact matches in the contents using string::find.
    EXPECT_NE(std::string::npos, page_text.find(expected_word));
    // But we can't find it by any of the variations, or the unexpected word.
    EXPECT_EQ(std::string::npos, page_text.find(wrong_case));
    EXPECT_EQ(std::string::npos, page_text.find(missing_accents));
    // Previously this re-checked wrong_case, leaving unexpected_word untested.
    EXPECT_EQ(std::string::npos, page_text.find(unexpected_word));

    // We can find it by any of the variations of it using FindMatchesUtf8.
    EXPECT_EQ(1, page->FindMatchesUtf8(expected_word, nullptr));
    EXPECT_EQ(1, page->FindMatchesUtf8(wrong_case, nullptr));
    EXPECT_EQ(1, page->FindMatchesUtf8(missing_accents, nullptr));
    // But still can't find a word if it isn't there at all.
    EXPECT_EQ(0, page->FindMatchesUtf8(unexpected_word, nullptr));
}
96
// Checks text extraction and search on a page that mixes Chinese and Latin
// text: words present on the page are found, a word that is absent is not.
TEST(Test, SearchPageText_chinese) {
    Document doc(LoadTestDocument(kChineseFile), false);
    std::shared_ptr<Page> page = doc.GetPage(0);

    const std::string present_chinese = "你好";
    const std::string present_english = "hello";
    const std::string absent_japanese = "先生";

    const std::string text = page->GetTextUtf8();
    const auto text_contains = [&text](const std::string& needle) {
        return text.find(needle) != std::string::npos;
    };

    // The extracted text holds the Chinese and the English word...
    EXPECT_TRUE(text_contains(present_chinese));
    EXPECT_TRUE(text_contains(present_english));
    // ...but not the Japanese one.
    EXPECT_FALSE(text_contains(absent_japanese));

    // Searching reports three matches each for the Chinese and the Latin word.
    EXPECT_EQ(3, page->FindMatchesUtf8(present_chinese, nullptr));
    EXPECT_EQ(3, page->FindMatchesUtf8(present_english, nullptr));
    // And none for the Japanese word, since it isn't on the page.
    EXPECT_EQ(0, page->FindMatchesUtf8(absent_japanese, nullptr));
}
118
// Checks how punctuation and hyphens in the query affect the number of matches
// reported by FindMatchesUtf8 on the sample PDF.
TEST(Test, SearchPageText_hyphens) {
    Document doc(LoadTestDocument(kSamplePdfFile), false);
    std::shared_ptr<Page> page = doc.GetPage(0);

    // Shorthand: number of matches for |query| on the page.
    const auto match_count = [&page](const std::string& query) {
        return page->FindMatchesUtf8(query, nullptr);
    };

    // Punctuation is generally not ignored.
    // The query "A. Bbbb", period included, matches exactly once:
    EXPECT_EQ(1, match_count("A. Bbbb"));
    // Leaving the period out ("A Bbbbb") matches nothing.
    EXPECT_EQ(0, match_count("A Bbbbb"));

    // "wide-area", hyphen included, is expected to match once:
    EXPECT_EQ(1, match_count("wide-area"));
    // Without the hyphen ("widearea") nothing matches.
    EXPECT_EQ(0, match_count("widearea"));

    // "support" matches 4 times; the original note says this count includes an
    // occurrence hyphenated across a line break.
    EXPECT_EQ(4, match_count("support"));
    // The hyphenated query "su-pport" matches once.
    // NOTE(review): presumably this is the line-broken occurrence - confirm
    // against sample_pdf.pdf.
    EXPECT_EQ(1, match_count("su-pport"));
    // A hyphen placed elsewhere in the word ("s-upport") matches nothing.
    EXPECT_EQ(0, match_count("s-upport"));
}
141
// Checks the bounding rectangles reported by BoundsOfMatchesUtf8 for a
// hyphenated phrase, including a match that is split over two lines.
//
// Every container size that guards later indexing is checked with a fatal
// ASSERT_*, so a failing expectation ends the test instead of reading past the
// end of a vector.
TEST(Test, GetTextBounds_hyphens) {
    Document doc(LoadTestDocument(kSamplePdfFile), false);
    std::shared_ptr<Page> page = doc.GetPage(0);

    Rectangle_i page_rect = page->Dimensions();

    std::vector<Rectangle_i> gd_bounds;
    std::vector<int> gd_m2r;  // match_to_rect
    // Finds 2 matches for testing-purpose.
    EXPECT_EQ(2, page->BoundsOfMatchesUtf8("testing-purpose", &gd_bounds, &gd_m2r, nullptr));
    EXPECT_EQ(2u, gd_m2r.size());

    // But 3 rectangles since one match is broken onto two lines.
    // Fatal on mismatch: gd_bounds[0..2] are indexed below.
    ASSERT_EQ(3u, gd_bounds.size());
    // The second one is broken onto two lines - this should look like so.
    EXPECT_EQ(1, NumRectsForMatch(gd_bounds, gd_m2r, 0));
    EXPECT_EQ(2, NumRectsForMatch(gd_bounds, gd_m2r, 1));

    for (size_t i = 0; i < gd_bounds.size(); i++) {
        // Any bounds should be of positive area, smaller than the page.
        EXPECT_GT(Area(gd_bounds[i]), 0);
        EXPECT_LT(Area(gd_bounds[i]), Area(page_rect));
        // And it should be entirely on the page: intersecting it with the page
        // must leave it unchanged.
        Rectangle_i copy = gd_bounds[i];
        copy = Intersect(copy, page_rect);
        EXPECT_EQ(gd_bounds[i], copy);
    }

    // First match is in a big font in the heading, should have the biggest area:
    for (size_t i = 1; i < gd_bounds.size(); i++) {
        EXPECT_GT(Area(gd_bounds[0]), Area(gd_bounds[i]));
    }

    std::vector<Rectangle_i> g_bounds;
    EXPECT_EQ(2, page->BoundsOfMatchesUtf8("testing-", &g_bounds, nullptr, nullptr));
    std::vector<Rectangle_i> d_bounds;
    EXPECT_EQ(2, page->BoundsOfMatchesUtf8("purpose", &d_bounds, nullptr, nullptr));
    // Element [1] of each vector is read below.
    ASSERT_GE(g_bounds.size(), 2u);
    ASSERT_GE(d_bounds.size(), 2u);

    // The second testing-purpose is split onto two lines - it should be
    // made of two rectangles, one which surrounds "testing-" and one which
    // surrounds "purpose".
    EXPECT_EQ(g_bounds[1], gd_bounds[1]);  // "testing-" rectangle.
    EXPECT_EQ(d_bounds[1], gd_bounds[2]);  // "purpose" rectangle.
}
186
// Regression test for http://b/17684639, using the sekret_no_password.pdf
// fixture.
TEST(Test, BugSwitzerland) {
    Document doc(LoadTestDocument(kSekretNoPassword), false);
    // Opening this text page shouldn't crash - http://b/17684639
    std::shared_ptr<Page> first_page = doc.GetPage(0);
    // The word "very" is expected to be matched twice on the page.
    const auto very_matches = first_page->FindMatchesUtf8("very", nullptr);
    EXPECT_EQ(2, very_matches);
}
193
194 } // namespace
195