1 /*
2  * Copyright (C) 2024 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <android-base/file.h>
18 #include <gtest/gtest.h>
19 
20 #include <memory>
21 #include <span>
22 #include <string>
23 #include <vector>
24 
25 // Goes first due to conflicts.
26 #include "cpp/fpdf_scopers.h"
27 #include "document.h"
28 #include "fpdfview.h"
29 #include "page.h"
30 #include "rect.h"
31 
32 namespace {
33 
34 using ::pdfClient::Document;
35 using ::pdfClient::Page;
36 using ::pdfClient::Rectangle_i;
37 
// Name of the fixture directory; GetTestFile() appends it to the directory
// of the test executable.
const std::string kTestdata{"testdata"};

// PDF fixtures loaded by the tests in this file.
const std::string kChineseFile{"chinese.pdf"};
const std::string kFrenchFile{"french.pdf"};
const std::string kSamplePdfFile{"sample_pdf.pdf"};
const std::string kSekretNoPassword{"sekret_no_password.pdf"};
43 
GetTestDataDir()44 std::string GetTestDataDir() {
45     return android::base::GetExecutableDirectory();
46 }
47 
GetTestFile(std::string filename)48 std::string GetTestFile(std::string filename) {
49     return GetTestDataDir() + "/" + kTestdata + "/" + filename;
50 }
51 
LoadTestDocument(const std::string filename)52 ScopedFPDFDocument LoadTestDocument(const std::string filename) {
53     return ScopedFPDFDocument(FPDF_LoadDocument(GetTestFile(filename).c_str(), nullptr));
54 }
55 
Area(const Rectangle_i & rect)56 int Area(const Rectangle_i& rect) {
57     return rect.Width() * rect.Height();
58 }
59 
NumRectsForMatch(std::span<const Rectangle_i> rects,std::span<const int> match_to_rect,int match)60 int NumRectsForMatch(std::span<const Rectangle_i> rects, std::span<const int> match_to_rect,
61                      int match) {
62     if (match < 0 || match >= match_to_rect.size()) {
63         return 0;
64     }
65     if (match + 1 == match_to_rect.size()) {
66         return rects.size() - match_to_rect[match];
67     }
68     return match_to_rect[match + 1] - match_to_rect[match];
69 }
70 
// Checks that a plain substring search over the extracted page text is exact,
// whereas Page::FindMatchesUtf8 also matches queries that differ from the page
// text in letter case or accents.
TEST(Test, SearchPageText_french) {
    Document doc(LoadTestDocument(kFrenchFile), false);
    std::shared_ptr<Page> page = doc.GetPage(0);

    const std::string expected_word = "généralement";
    const std::string wrong_case = "GÉNérALEment";
    const std::string missing_accents = "GENerALEment";
    const std::string unexpected_word = "discothèque";

    std::string page_text = page->GetTextUtf8();

    // We can find exact matches in the contents using string::find.
    EXPECT_NE(std::string::npos, page_text.find(expected_word));
    // But we can't find it by any of the variations, or the unexpected word.
    EXPECT_EQ(std::string::npos, page_text.find(wrong_case));
    EXPECT_EQ(std::string::npos, page_text.find(missing_accents));
    EXPECT_EQ(std::string::npos, page_text.find(unexpected_word));

    // We can find it by any of the variations of it using FindMatchesUtf8.
    EXPECT_EQ(1, page->FindMatchesUtf8(expected_word, nullptr));
    EXPECT_EQ(1, page->FindMatchesUtf8(wrong_case, nullptr));
    EXPECT_EQ(1, page->FindMatchesUtf8(missing_accents, nullptr));
    // But still can't find a word if it isn't there at all.
    EXPECT_EQ(0, page->FindMatchesUtf8(unexpected_word, nullptr));
}
96 
// Exercises text extraction and search on a page whose text contains both a
// Chinese word and an English word.
TEST(Test, SearchPageText_chinese) {
    Document document(LoadTestDocument(kChineseFile), false);
    const std::shared_ptr<Page> first_page = document.GetPage(0);

    const std::string chinese = "你好";
    const std::string english = "hello";
    const std::string japanese = "先生";

    const std::string extracted = first_page->GetTextUtf8();
    const auto not_found = std::string::npos;

    // The extracted text contains the Chinese and the English word...
    EXPECT_NE(not_found, extracted.find(chinese));
    EXPECT_NE(not_found, extracted.find(english));
    // ...but not the Japanese one.
    EXPECT_EQ(not_found, extracted.find(japanese));

    // Searching reports three matches each for the Chinese and English words.
    EXPECT_EQ(3, first_page->FindMatchesUtf8(chinese, nullptr));
    EXPECT_EQ(3, first_page->FindMatchesUtf8(english, nullptr));
    // The Japanese word is absent, so searching for it yields no match.
    EXPECT_EQ(0, first_page->FindMatchesUtf8(japanese, nullptr));
}
118 
// Checks how punctuation and hyphens in the query affect the number of
// matches reported by Page::FindMatchesUtf8 on the sample document.
TEST(Test, SearchPageText_hyphens) {
    Document document(LoadTestDocument(kSamplePdfFile), false);
    const std::shared_ptr<Page> first_page = document.GetPage(0);

    // Shorthand: number of matches for |query| on the first page.
    const auto match_count = [&first_page](const char* query) {
        return first_page->FindMatchesUtf8(query, nullptr);
    };

    // Punctuation is generally not ignored: the query with the period matches
    // once, the same words without the period do not match at all.
    // NOTE(review): the first query ends in four 'b's while the second has
    // five - presumably the page text is "A. Bbbbb"; confirm against the PDF.
    EXPECT_EQ(1, match_count("A. Bbbb"));
    EXPECT_EQ(0, match_count("A Bbbbb"));

    // "wide-area" is expected to match once; dropping the hyphen gives nothing.
    EXPECT_EQ(1, match_count("wide-area"));
    EXPECT_EQ(0, match_count("widearea"));

    // "support" is expected four times; presumably this count includes an
    // occurrence that is hyphenated across a line break - confirm against the
    // PDF.
    EXPECT_EQ(4, match_count("support"));
    // A query with a hyphen inside the word matches once for this position...
    // NOTE(review): the query is "su-pport", not "sup-port"; verify that this
    // is the hyphen position the fixture actually breaks the word at.
    EXPECT_EQ(1, match_count("su-pport"));
    // ...and not at all when the hyphen is placed elsewhere in the word.
    EXPECT_EQ(0, match_count("s-upport"));
}
141 
// Checks the rectangles Page::BoundsOfMatchesUtf8 reports for a hyphenated
// phrase that occurs twice on the page, once of which is expected to be split
// across two lines (and therefore covered by two rectangles).
TEST(Test, GetTextBounds_hyphens) {
    Document doc(LoadTestDocument(kSamplePdfFile), false);
    std::shared_ptr<Page> page = doc.GetPage(0);

    Rectangle_i page_rect = page->Dimensions();

    std::vector<Rectangle_i> gd_bounds;
    std::vector<int> gd_m2r;  // match_to_rect
    // Finds 2 matches for testing-purpose.
    EXPECT_EQ(2, page->BoundsOfMatchesUtf8("testing-purpose", &gd_bounds, &gd_m2r, nullptr));
    EXPECT_EQ(2u, gd_m2r.size());

    // But 3 rectangles since one match is broken onto two lines.
    // Fatal on failure: everything below indexes gd_bounds[0..2], which would
    // read out of bounds if fewer rectangles were returned.
    ASSERT_EQ(3u, gd_bounds.size());
    // The second one is broken onto two lines - this should look like so.
    EXPECT_EQ(1, NumRectsForMatch(gd_bounds, gd_m2r, 0));
    EXPECT_EQ(2, NumRectsForMatch(gd_bounds, gd_m2r, 1));

    for (const Rectangle_i& bounds : gd_bounds) {
        // Any bounds should be of positive area, smaller than the page.
        EXPECT_GT(Area(bounds), 0);
        EXPECT_LT(Area(bounds), Area(page_rect));
        // And it should be entirely on the page: intersecting it with the
        // page rectangle must leave it unchanged.
        Rectangle_i clipped = bounds;
        clipped = Intersect(clipped, page_rect);
        EXPECT_EQ(bounds, clipped);
    }

    // First match is in a big font in the heading, should have the biggest area:
    for (std::size_t i = 1; i < gd_bounds.size(); ++i) {
        EXPECT_GT(Area(gd_bounds[0]), Area(gd_bounds[i]));
    }

    std::vector<Rectangle_i> g_bounds;
    EXPECT_EQ(2, page->BoundsOfMatchesUtf8("testing-", &g_bounds, nullptr, nullptr));
    std::vector<Rectangle_i> d_bounds;
    EXPECT_EQ(2, page->BoundsOfMatchesUtf8("purpose", &d_bounds, nullptr, nullptr));

    // Element [1] of each vector is read below; stop here rather than index
    // past the end if a search returned fewer rectangles than expected.
    ASSERT_GE(g_bounds.size(), 2u);
    ASSERT_GE(d_bounds.size(), 2u);

    // The second testing-purpose is split onto two lines - it should be
    // made of two rectangles, one which surrounds "testing-" and one which
    // surrounds "purpose".
    EXPECT_EQ(g_bounds[1], gd_bounds[1]);  // "testing-" rectangle.
    EXPECT_EQ(d_bounds[1], gd_bounds[2]);  // "purpose" rectangle.
}
186 
// Regression test for http://b/17684639: opening the text page of this
// document must not crash.
TEST(Test, BugSwitzerland) {
    Document document(LoadTestDocument(kSekretNoPassword), false);
    // Obtaining the page is itself the crash check.
    const std::shared_ptr<Page> first_page = document.GetPage(0);

    // The page is also expected to contain the word "very" twice.
    const auto very_matches = first_page->FindMatchesUtf8("very", nullptr);
    EXPECT_EQ(2, very_matches);
}
193 
194 }  // namespace
195