1 // Copyright 2015 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include <algorithm>
6 #include <utility>
7 #include <vector>
8
9 #include "build/build_config.h"
10 #include "core/fxge/fx_font.h"
11 #include "public/cpp/fpdf_scopers.h"
12 #include "public/fpdf_doc.h"
13 #include "public/fpdf_text.h"
14 #include "public/fpdf_transformpage.h"
15 #include "public/fpdfview.h"
16 #include "testing/embedder_test.h"
17 #include "testing/fx_string_testhelpers.h"
18 #include "testing/gtest/include/gtest/gtest.h"
19
20 namespace {
21
// Text that the tests below expect to extract from hello_world.pdf.
constexpr char kHelloGoodbyeText[] = "Hello, world!\r\nGoodbye, world!";
// Element count of |kHelloGoodbyeText|, including its terminating NUL.
constexpr int kHelloGoodbyeTextSize = std::size(kHelloGoodbyeText);
24
// Returns true when the first |length| elements of |actual| equal the
// corresponding characters of |expected|, each converted to unsigned short.
// |length| may cover the terminating NUL of |expected| but nothing past it;
// any larger |length| yields false without reading either buffer.
bool check_unsigned_shorts(const char* expected,
                           const unsigned short* actual,
                           size_t length) {
  const size_t comparable_length = strlen(expected) + 1;
  if (comparable_length < length)
    return false;

  // Walk forward until the first mismatch or until |length| is exhausted.
  size_t matched = 0;
  while (matched < length &&
         actual[matched] == static_cast<unsigned short>(expected[matched])) {
    ++matched;
  }
  return matched == length;
}
37
38 } // namespace
39
// Fixture for the text and link embedder tests in this file. It adds nothing
// on top of EmbedderTest.
class FPDFTextEmbedderTest : public EmbedderTest {};
41
// Exercises the basic text page API against hello_world.pdf:
// FPDFText_GetText(), FPDFText_CountChars(), FPDFText_GetUnicode(),
// FPDFText_GetFontSize(), the char box / loose char box / char origin
// getters, FPDFText_GetCharIndexAtPos(), FPDFText_CountRects(),
// FPDFText_GetRect() and FPDFText_GetBoundedText(), including calls with
// null handles, null out-parameters and out-of-range indices.
TEST_F(FPDFTextEmbedderTest, Text) {
  ASSERT_TRUE(OpenDocument("hello_world.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
  ASSERT_TRUE(textpage);

  // The buffer is pre-filled with a 0xbd pattern before each extraction so
  // that elements the API did not write can be recognized as 0xbdbd.
  unsigned short buffer[128];
  memset(buffer, 0xbd, sizeof(buffer));

  // Check that edge cases are handled gracefully
  EXPECT_EQ(0, FPDFText_GetText(textpage, 0, 128, nullptr));
  EXPECT_EQ(0, FPDFText_GetText(textpage, -1, 128, buffer));
  EXPECT_EQ(0, FPDFText_GetText(textpage, 0, -1, buffer));
  EXPECT_EQ(1, FPDFText_GetText(textpage, 0, 0, buffer));
  EXPECT_EQ(0, buffer[0]);

  // Keep going and check the next case.
  memset(buffer, 0xbd, sizeof(buffer));
  EXPECT_EQ(2, FPDFText_GetText(textpage, 0, 1, buffer));
  EXPECT_EQ(kHelloGoodbyeText[0], buffer[0]);
  EXPECT_EQ(0, buffer[1]);

  // Check includes the terminating NUL that is provided.
  int num_chars = FPDFText_GetText(textpage, 0, 128, buffer);
  ASSERT_EQ(kHelloGoodbyeTextSize, num_chars);
  EXPECT_TRUE(
      check_unsigned_shorts(kHelloGoodbyeText, buffer, kHelloGoodbyeTextSize));

  // Count does not include the terminating NUL in the string literal.
  EXPECT_EQ(kHelloGoodbyeTextSize - 1, FPDFText_CountChars(textpage));
  for (size_t i = 0; i < kHelloGoodbyeTextSize - 1; ++i) {
    EXPECT_EQ(static_cast<unsigned int>(kHelloGoodbyeText[i]),
              FPDFText_GetUnicode(textpage, i))
        << " at " << i;
  }

  // Extract the first 5 characters into a separate, smaller buffer. The call
  // is expected to report 6 elements: the 5 characters plus a terminating
  // NUL. Pre-fill |small_buffer| itself (not |buffer|) so the comparison
  // below cannot be satisfied by indeterminate stack contents.
  static const char kSmallExpected[] = "Hello";
  unsigned short small_buffer[12];
  memset(small_buffer, 0xbd, sizeof(small_buffer));
  EXPECT_EQ(6, FPDFText_GetText(textpage, 0, 5, small_buffer));
  EXPECT_TRUE(check_unsigned_shorts(kSmallExpected, small_buffer,
                                    sizeof(kSmallExpected)));

  EXPECT_EQ(12.0, FPDFText_GetFontSize(textpage, 0));
  EXPECT_EQ(16.0, FPDFText_GetFontSize(textpage, 15));

  // Failed FPDFText_GetCharBox() calls must leave the outputs untouched, so
  // seed them with distinct sentinel values.
  double left = 1.0;
  double right = 2.0;
  double bottom = 3.0;
  double top = 4.0;
  EXPECT_FALSE(FPDFText_GetCharBox(nullptr, 4, &left, &right, &bottom, &top));
  EXPECT_DOUBLE_EQ(1.0, left);
  EXPECT_DOUBLE_EQ(2.0, right);
  EXPECT_DOUBLE_EQ(3.0, bottom);
  EXPECT_DOUBLE_EQ(4.0, top);
  EXPECT_FALSE(FPDFText_GetCharBox(textpage, -1, &left, &right, &bottom, &top));
  EXPECT_DOUBLE_EQ(1.0, left);
  EXPECT_DOUBLE_EQ(2.0, right);
  EXPECT_DOUBLE_EQ(3.0, bottom);
  EXPECT_DOUBLE_EQ(4.0, top);
  EXPECT_FALSE(FPDFText_GetCharBox(textpage, 55, &left, &right, &bottom, &top));
  EXPECT_DOUBLE_EQ(1.0, left);
  EXPECT_DOUBLE_EQ(2.0, right);
  EXPECT_DOUBLE_EQ(3.0, bottom);
  EXPECT_DOUBLE_EQ(4.0, top);
  EXPECT_FALSE(
      FPDFText_GetCharBox(textpage, 4, nullptr, &right, &bottom, &top));
  EXPECT_FALSE(FPDFText_GetCharBox(textpage, 4, &left, nullptr, &bottom, &top));
  EXPECT_FALSE(FPDFText_GetCharBox(textpage, 4, &left, &right, nullptr, &top));
  EXPECT_FALSE(
      FPDFText_GetCharBox(textpage, 4, &left, &right, &bottom, nullptr));
  EXPECT_FALSE(
      FPDFText_GetCharBox(textpage, 4, nullptr, nullptr, nullptr, nullptr));

  EXPECT_TRUE(FPDFText_GetCharBox(textpage, 4, &left, &right, &bottom, &top));
  EXPECT_NEAR(41.120, left, 0.001);
  EXPECT_NEAR(46.208, right, 0.001);
  EXPECT_NEAR(49.892, bottom, 0.001);
  EXPECT_NEAR(55.652, top, 0.001);

  // Same sentinel approach for FPDFText_GetLooseCharBox(). The initializer
  // order follows the FS_RECTF member order, which the expectations below
  // show to be left, top, right, bottom.
  FS_RECTF rect = {4.0f, 1.0f, 3.0f, 2.0f};
  EXPECT_FALSE(FPDFText_GetLooseCharBox(nullptr, 4, &rect));
  EXPECT_FLOAT_EQ(4.0f, rect.left);
  EXPECT_FLOAT_EQ(3.0f, rect.right);
  EXPECT_FLOAT_EQ(2.0f, rect.bottom);
  EXPECT_FLOAT_EQ(1.0f, rect.top);
  EXPECT_FALSE(FPDFText_GetLooseCharBox(textpage, -1, &rect));
  EXPECT_FLOAT_EQ(4.0f, rect.left);
  EXPECT_FLOAT_EQ(3.0f, rect.right);
  EXPECT_FLOAT_EQ(2.0f, rect.bottom);
  EXPECT_FLOAT_EQ(1.0f, rect.top);
  EXPECT_FALSE(FPDFText_GetLooseCharBox(textpage, 55, &rect));
  EXPECT_FLOAT_EQ(4.0f, rect.left);
  EXPECT_FLOAT_EQ(3.0f, rect.right);
  EXPECT_FLOAT_EQ(2.0f, rect.bottom);
  EXPECT_FLOAT_EQ(1.0f, rect.top);
  EXPECT_FALSE(FPDFText_GetLooseCharBox(textpage, 4, nullptr));

  EXPECT_TRUE(FPDFText_GetLooseCharBox(textpage, 4, &rect));
  EXPECT_FLOAT_EQ(40.664001f, rect.left);
  EXPECT_FLOAT_EQ(46.664001f, rect.right);
  EXPECT_FLOAT_EQ(47.667271f, rect.bottom);
  EXPECT_FLOAT_EQ(59.667271f, rect.top);

  double x = 0.0;
  double y = 0.0;
  EXPECT_TRUE(FPDFText_GetCharOrigin(textpage, 4, &x, &y));
  EXPECT_NEAR(40.664, x, 0.001);
  EXPECT_NEAR(50.000, y, 0.001);

  EXPECT_EQ(4, FPDFText_GetCharIndexAtPos(textpage, 42.0, 50.0, 1.0, 1.0));
  EXPECT_EQ(-1, FPDFText_GetCharIndexAtPos(textpage, 0.0, 0.0, 1.0, 1.0));
  EXPECT_EQ(-1, FPDFText_GetCharIndexAtPos(textpage, 199.0, 199.0, 1.0, 1.0));

  // Test out of range positions.
  EXPECT_EQ(-1,
            FPDFText_GetCharIndexAtPos(textpage, 42.0, 10000000.0, 1.0, 1.0));
  EXPECT_EQ(-1, FPDFText_GetCharIndexAtPos(textpage, -1.0, 50.0, 1.0, 1.0));

  // Count does not include the terminating NUL in the string literal.
  EXPECT_EQ(2, FPDFText_CountRects(textpage, 0, kHelloGoodbyeTextSize - 1));

  // Note the FPDFText_GetRect() output order: left, top, right, bottom.
  left = 0.0;
  right = 0.0;
  bottom = 0.0;
  top = 0.0;
  EXPECT_TRUE(FPDFText_GetRect(textpage, 1, &left, &top, &right, &bottom));
  EXPECT_NEAR(20.800, left, 0.001);
  EXPECT_NEAR(135.040, right, 0.001);
  EXPECT_NEAR(96.688, bottom, 0.001);
  EXPECT_NEAR(111.600, top, 0.001);

  // Test out of range indices set outputs to (0.0, 0.0, 0.0, 0.0).
  left = -1.0;
  right = -1.0;
  bottom = -1.0;
  top = -1.0;
  EXPECT_FALSE(FPDFText_GetRect(textpage, -1, &left, &top, &right, &bottom));
  EXPECT_EQ(0.0, left);
  EXPECT_EQ(0.0, right);
  EXPECT_EQ(0.0, bottom);
  EXPECT_EQ(0.0, top);

  left = -2.0;
  right = -2.0;
  bottom = -2.0;
  top = -2.0;
  EXPECT_FALSE(FPDFText_GetRect(textpage, 2, &left, &top, &right, &bottom));
  EXPECT_EQ(0.0, left);
  EXPECT_EQ(0.0, right);
  EXPECT_EQ(0.0, bottom);
  EXPECT_EQ(0.0, top);

  // With no buffer, the return value is the number of characters in the
  // bounded region.
  EXPECT_EQ(
      9, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0, nullptr, 0));

  // Extract starting at character 4 as above.
  memset(buffer, 0xbd, sizeof(buffer));
  EXPECT_EQ(
      1, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0, buffer, 1));
  EXPECT_TRUE(check_unsigned_shorts(kHelloGoodbyeText + 4, buffer, 1));
  EXPECT_EQ(0xbdbd, buffer[1]);

  memset(buffer, 0xbd, sizeof(buffer));
  EXPECT_EQ(
      9, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0, buffer, 9));
  EXPECT_TRUE(check_unsigned_shorts(kHelloGoodbyeText + 4, buffer, 8));
  EXPECT_EQ(0xbdbd, buffer[9]);

  // With ample room, a terminating NUL is appended and counted.
  memset(buffer, 0xbd, sizeof(buffer));
  EXPECT_EQ(10, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0,
                                        buffer, 128));
  EXPECT_TRUE(check_unsigned_shorts(kHelloGoodbyeText + 4, buffer, 9));
  EXPECT_EQ(0u, buffer[9]);
  EXPECT_EQ(0xbdbd, buffer[10]);

  FPDFText_ClosePage(textpage);
  UnloadPage(page);
}
226
// Checks the font size, character origins and loose character boxes that the
// text page reports for characters 1 and 2 of vertical_text.pdf.
TEST_F(FPDFTextEmbedderTest, TextVertical) {
  ASSERT_TRUE(OpenDocument("vertical_text.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE text_page = FPDFText_LoadPage(page);
  ASSERT_TRUE(text_page);

  EXPECT_EQ(12.0, FPDFText_GetFontSize(text_page, 0));

  // Expected character origins, keyed by character index.
  struct OriginCase {
    int char_index;
    double x;
    double y;
  };
  static const OriginCase kOriginCases[] = {
      {1, 6.664, 171.508},
      {2, 8.668, 160.492},
  };
  double origin_x = 0.0;
  double origin_y = 0.0;
  for (const OriginCase& expected : kOriginCases) {
    EXPECT_TRUE(FPDFText_GetCharOrigin(text_page, expected.char_index,
                                       &origin_x, &origin_y));
    EXPECT_NEAR(expected.x, origin_x, 0.001);
    EXPECT_NEAR(expected.y, origin_y, 0.001);
  }

  // Expected loose boxes, keyed by character index.
  struct LooseBoxCase {
    int char_index;
    double left;
    double right;
    double bottom;
    double top;
  };
  static const LooseBoxCase kLooseBoxCases[] = {
      {1, 4, 16, 178.984, 170.308},
      {2, 4, 16, 170.308, 159.292},
  };
  FS_RECTF loose_box;
  for (const LooseBoxCase& expected : kLooseBoxCases) {
    EXPECT_TRUE(
        FPDFText_GetLooseCharBox(text_page, expected.char_index, &loose_box));
    EXPECT_NEAR(expected.left, loose_box.left, 0.001);
    EXPECT_NEAR(expected.right, loose_box.right, 0.001);
    EXPECT_NEAR(expected.bottom, loose_box.bottom, 0.001);
    EXPECT_NEAR(expected.top, loose_box.top, 0.001);
  }

  FPDFText_ClosePage(text_page);
  UnloadPage(page);
}
263
// Extracts all text from hebrew_mirrored.pdf and checks each returned code
// unit against the expected sequence.
TEST_F(FPDFTextEmbedderTest, TextHebrewMirrored) {
  ASSERT_TRUE(OpenDocument("hebrew_mirrored.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Inner scope so the text page is released before the page is unloaded.
    ScopedFPDFTextPage text_page(FPDFText_LoadPage(page));
    ASSERT_TRUE(text_page);

    constexpr int kCharCount = 10;
    ASSERT_EQ(kCharCount, FPDFText_CountChars(text_page.get()));

    // One extra element for the terminating NUL included in the return value.
    unsigned short extracted[kCharCount + 1];
    memset(extracted, 0x42, sizeof(extracted));
    EXPECT_EQ(kCharCount + 1,
              FPDFText_GetText(text_page.get(), 0, kCharCount, extracted));

    static constexpr int kExpectedCodeUnits[kCharCount] = {
        0x05d1, 0x05e0, 0x05d9, 0x05de, 0x05d9,
        0x05df, 0x000d, 0x000a, 0x05df, 0x05d1,
    };
    for (int pos = 0; pos < kCharCount; ++pos)
      EXPECT_EQ(kExpectedCodeUnits[pos], extracted[pos]);
  }

  UnloadPage(page);
}
294
// Runs FPDFText_FindStart() / FPDFText_FindNext() / FPDFText_FindPrev()
// against hello_world.pdf with different needles, flags and start indices,
// and checks the match index and match length reported after each step.
TEST_F(FPDFTextEmbedderTest, TextSearch) {
  ASSERT_TRUE(OpenDocument("hello_world.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE text_page = FPDFText_LoadPage(page);
  ASSERT_TRUE(text_page);

  ScopedFPDFWideString needle_nope = GetFPDFWideString(L"nope");
  ScopedFPDFWideString needle_world = GetFPDFWideString(L"world");
  ScopedFPDFWideString needle_world_caps = GetFPDFWideString(L"WORLD");
  ScopedFPDFWideString needle_orld = GetFPDFWideString(L"orld");

  // Checks the result index and match length currently reported by |find|.
  const auto expect_match = [](const ScopedFPDFTextFind& find, int index,
                               int count) {
    EXPECT_EQ(index, FPDFText_GetSchResultIndex(find.get()));
    EXPECT_EQ(count, FPDFText_GetSchCount(find.get()));
  };

  {
    // "nope" does not occur anywhere in the test page.
    ScopedFPDFTextFind find(
        FPDFText_FindStart(text_page, needle_nope.get(), 0, 0));
    EXPECT_TRUE(find);
    expect_match(find, 0, 0);

    // Moving forward finds nothing.
    EXPECT_FALSE(FPDFText_FindNext(find.get()));
    expect_match(find, 0, 0);

    // Moving backward finds nothing.
    EXPECT_FALSE(FPDFText_FindPrev(find.get()));
    expect_match(find, 0, 0);
  }

  {
    // "world" occurs twice in the test page.
    ScopedFPDFTextFind find(
        FPDFText_FindStart(text_page, needle_world.get(), 0, 2));
    EXPECT_TRUE(find);

    // Nothing is reported as found until the search is advanced.
    expect_match(find, 0, 0);

    // First occurrence.
    EXPECT_TRUE(FPDFText_FindNext(find.get()));
    expect_match(find, 7, 5);

    // Last occurrence.
    EXPECT_TRUE(FPDFText_FindNext(find.get()));
    expect_match(find, 24, 5);

    // A failed advance leaves the found position unchanged.
    EXPECT_FALSE(FPDFText_FindNext(find.get()));
    expect_match(find, 24, 5);

    // Back to the first occurrence.
    EXPECT_TRUE(FPDFText_FindPrev(find.get()));
    expect_match(find, 7, 5);

    // A failed retreat leaves the found position unchanged.
    EXPECT_FALSE(FPDFText_FindPrev(find.get()));
    expect_match(find, 7, 5);
  }

  {
    // An exact match is unaffected by the case sensitivity and whole word
    // flags.
    ScopedFPDFTextFind find(
        FPDFText_FindStart(text_page, needle_world.get(),
                           FPDF_MATCHCASE | FPDF_MATCHWHOLEWORD, 0));
    EXPECT_TRUE(find);
    EXPECT_TRUE(FPDFText_FindNext(find.get()));
    expect_match(find, 7, 5);
  }

  {
    // The default is case-insensitive, so matching against caps works.
    ScopedFPDFTextFind find(
        FPDFText_FindStart(text_page, needle_world_caps.get(), 0, 0));
    EXPECT_TRUE(find);
    EXPECT_TRUE(FPDFText_FindNext(find.get()));
    expect_match(find, 7, 5);
  }

  {
    // With FPDF_MATCHCASE the all-caps needle no longer matches.
    ScopedFPDFTextFind find(FPDFText_FindStart(
        text_page, needle_world_caps.get(), FPDF_MATCHCASE, 0));
    EXPECT_FALSE(FPDFText_FindNext(find.get()));
    expect_match(find, 0, 0);
  }

  {
    // The default matches anywhere within a word, so a substring is found.
    ScopedFPDFTextFind find(
        FPDFText_FindStart(text_page, needle_orld.get(), 0, 0));
    EXPECT_TRUE(FPDFText_FindNext(find.get()));
    expect_match(find, 8, 4);
  }

  {
    // With FPDF_MATCHWHOLEWORD the substring must match word boundaries, so
    // the search fails.
    ScopedFPDFTextFind find(FPDFText_FindStart(text_page, needle_orld.get(),
                                               FPDF_MATCHWHOLEWORD, 0));
    EXPECT_FALSE(FPDFText_FindNext(find.get()));
    // TODO(tsepez): investigate strange index/count values in this state.
  }

  FPDFText_ClosePage(text_page);
  UnloadPage(page);
}
410
// Searches find_text_consecutive.pdf for "aaaa" with and without
// FPDF_CONSECUTIVE and checks which match positions are visited when moving
// forward and backward.
TEST_F(FPDFTextEmbedderTest, TextSearchConsecutive) {
  ASSERT_TRUE(OpenDocument("find_text_consecutive.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE text_page = FPDFText_LoadPage(page);
  ASSERT_TRUE(text_page);

  ScopedFPDFWideString needle = GetFPDFWideString(L"aaaa");

  // Checks the result index and match length currently reported by |find|.
  const auto expect_match = [](const ScopedFPDFTextFind& find, int index,
                               int count) {
    EXPECT_EQ(index, FPDFText_GetSchResultIndex(find.get()));
    EXPECT_EQ(count, FPDFText_GetSchCount(find.get()));
  };

  {
    // Without flags, "aaaa" is found twice in "aaaaaaaaaa".
    ScopedFPDFTextFind find(FPDFText_FindStart(text_page, needle.get(), 0, 0));
    EXPECT_TRUE(find);

    // Nothing is reported as found until the search is advanced.
    expect_match(find, 0, 0);

    // First occurrence.
    EXPECT_TRUE(FPDFText_FindNext(find.get()));
    expect_match(find, 0, 4);

    // Last occurrence.
    EXPECT_TRUE(FPDFText_FindNext(find.get()));
    expect_match(find, 4, 4);

    // A failed advance leaves the found position unchanged.
    EXPECT_FALSE(FPDFText_FindNext(find.get()));
    expect_match(find, 4, 4);

    // Back to the first occurrence.
    EXPECT_TRUE(FPDFText_FindPrev(find.get()));
    expect_match(find, 0, 4);

    // A failed retreat leaves the found position unchanged.
    EXPECT_FALSE(FPDFText_FindPrev(find.get()));
    expect_match(find, 0, 4);
  }

  {
    // With FPDF_CONSECUTIVE, "aaaa" is found 7 times in "aaaaaaaaaa", at
    // indices 0 through 6.
    constexpr int kMatchCount = 7;
    ScopedFPDFTextFind find(
        FPDFText_FindStart(text_page, needle.get(), FPDF_CONSECUTIVE, 0));
    EXPECT_TRUE(find);

    // Nothing is reported as found until the search is advanced.
    expect_match(find, 0, 0);

    // Walk forward through every match.
    for (int expected_index = 0; expected_index < kMatchCount;
         ++expected_index) {
      EXPECT_TRUE(FPDFText_FindNext(find.get()));
      expect_match(find, expected_index, 4);
    }

    // A failed advance leaves the found position unchanged.
    EXPECT_FALSE(FPDFText_FindNext(find.get()));
    expect_match(find, kMatchCount - 1, 4);

    // Walk backward from the second-to-last match to the first.
    for (int expected_index = kMatchCount - 2; expected_index >= 0;
         --expected_index) {
      EXPECT_TRUE(FPDFText_FindPrev(find.get()));
      expect_match(find, expected_index, 4);
    }

    // A failed retreat leaves the found position unchanged.
    EXPECT_FALSE(FPDFText_FindPrev(find.get()));
    expect_match(find, 0, 4);
  }

  FPDFText_ClosePage(text_page);
  UnloadPage(page);
}
494
495 // Fails on Windows. https://crbug.com/pdfium/1370
496 #if BUILDFLAG(IS_WIN)
497 #define MAYBE_TextSearchLatinExtended DISABLED_TextSearchLatinExtended
498 #else
499 #define MAYBE_TextSearchLatinExtended TextSearchLatinExtended
500 #endif
TEST_F(FPDFTextEmbedderTest,MAYBE_TextSearchLatinExtended)501 TEST_F(FPDFTextEmbedderTest, MAYBE_TextSearchLatinExtended) {
502 ASSERT_TRUE(OpenDocument("latin_extended.pdf"));
503 FPDF_PAGE page = LoadPage(0);
504 ASSERT_TRUE(page);
505
506 FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
507 ASSERT_TRUE(textpage);
508
509 // Upper/lowercase 'a' with breve.
510 constexpr FPDF_WCHAR kNeedleUpper[] = {0x0102, 0x0000};
511 constexpr FPDF_WCHAR kNeedleLower[] = {0x0103, 0x0000};
512
513 for (const auto* needle : {kNeedleUpper, kNeedleLower}) {
514 ScopedFPDFTextFind search(FPDFText_FindStart(textpage, needle, 0, 0));
515 EXPECT_TRUE(search);
516 EXPECT_EQ(0, FPDFText_GetSchResultIndex(search.get()));
517 EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
518
519 // Should find 2 results at position 21/22, both with length 1.
520 EXPECT_TRUE(FPDFText_FindNext(search.get()));
521 EXPECT_EQ(2, FPDFText_GetSchResultIndex(search.get()));
522 EXPECT_EQ(1, FPDFText_GetSchCount(search.get()));
523 EXPECT_TRUE(FPDFText_FindNext(search.get()));
524 EXPECT_EQ(3, FPDFText_GetSchResultIndex(search.get()));
525 EXPECT_EQ(1, FPDFText_GetSchCount(search.get()));
526 // And no more than 2 results.
527 EXPECT_FALSE(FPDFText_FindNext(search.get()));
528 }
529
530 FPDFText_ClosePage(textpage);
531 UnloadPage(page);
532 }
533
534 // Test that the page has characters despite a bad stream length.
TEST_F(FPDFTextEmbedderTest,StreamLengthPastEndOfFile)535 TEST_F(FPDFTextEmbedderTest, StreamLengthPastEndOfFile) {
536 ASSERT_TRUE(OpenDocument("bug_57.pdf"));
537 FPDF_PAGE page = LoadPage(0);
538 ASSERT_TRUE(page);
539
540 FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
541 ASSERT_TRUE(textpage);
542 EXPECT_EQ(13, FPDFText_CountChars(textpage));
543
544 FPDFText_ClosePage(textpage);
545 UnloadPage(page);
546 }
547
// Exercises web link detection on weblinks.pdf: the link count, URL
// retrieval into buffers of different sizes, rect counts, and rect lookup
// with valid and invalid link / rect indices.
TEST_F(FPDFTextEmbedderTest, WebLinks) {
  ASSERT_TRUE(OpenDocument("weblinks.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE text_page = FPDFText_LoadPage(page);
  ASSERT_TRUE(text_page);

  {
    ScopedFPDFPageLink scoped_links(FPDFLink_LoadWebLinks(text_page));
    EXPECT_TRUE(scoped_links);

    // The page contains two HTTP-style URLs.
    EXPECT_EQ(2, FPDFLink_CountWebLinks(scoped_links.get()));

    // Bogus link indices only require room for a terminating NUL.
    for (int bogus_index : {2, 1400, -1}) {
      EXPECT_EQ(1,
                FPDFLink_GetURL(scoped_links.get(), bogus_index, nullptr, 0));
    }
  }

  FPDF_PAGELINK links = FPDFLink_LoadWebLinks(text_page);
  EXPECT_TRUE(links);

  // Number of characters needed for each link, including the NUL.
  EXPECT_EQ(25, FPDFLink_GetURL(links, 0, nullptr, 0));
  EXPECT_EQ(26, FPDFLink_GetURL(links, 1, nullptr, 0));

  static const char kFirstUrl[] = "http://example.com?q=foo";
  static const size_t kFirstUrlSize = sizeof(kFirstUrl);
  unsigned short url_buffer[128];

  // Fills the buffer with a 0xbd pattern so that elements the API did not
  // write read back as 0xbdbd.
  const auto poison_buffer = [&url_buffer]() {
    memset(url_buffer, 0xbd, sizeof(url_buffer));
  };

  // A too-small buffer receives a truncated URL without a NUL terminator,
  // and nothing past the indicated length may be modified.
  poison_buffer();
  EXPECT_EQ(1, FPDFLink_GetURL(links, 0, url_buffer, 1));
  EXPECT_TRUE(check_unsigned_shorts(kFirstUrl, url_buffer, 1));
  EXPECT_EQ(0xbdbd, url_buffer[1]);

  // A buffer with no room for the terminating NUL.
  poison_buffer();
  EXPECT_EQ(static_cast<int>(kFirstUrlSize - 1),
            FPDFLink_GetURL(links, 0, url_buffer, kFirstUrlSize - 1));
  EXPECT_TRUE(check_unsigned_shorts(kFirstUrl, url_buffer, kFirstUrlSize - 1));
  EXPECT_EQ(0xbdbd, url_buffer[kFirstUrlSize - 1]);

  // An exactly-sized buffer.
  poison_buffer();
  EXPECT_EQ(static_cast<int>(kFirstUrlSize),
            FPDFLink_GetURL(links, 0, url_buffer, kFirstUrlSize));
  EXPECT_TRUE(check_unsigned_shorts(kFirstUrl, url_buffer, kFirstUrlSize));
  EXPECT_EQ(0u, url_buffer[kFirstUrlSize - 1]);
  EXPECT_EQ(0xbdbd, url_buffer[kFirstUrlSize]);

  // An amply-sized buffer.
  poison_buffer();
  EXPECT_EQ(static_cast<int>(kFirstUrlSize),
            FPDFLink_GetURL(links, 0, url_buffer, 128));
  EXPECT_TRUE(check_unsigned_shorts(kFirstUrl, url_buffer, kFirstUrlSize));
  EXPECT_EQ(0u, url_buffer[kFirstUrlSize - 1]);
  EXPECT_EQ(0xbdbd, url_buffer[kFirstUrlSize]);

  // Each valid link is rendered in a single rect in this test page.
  EXPECT_EQ(1, FPDFLink_CountRects(links, 0));
  EXPECT_EQ(1, FPDFLink_CountRects(links, 1));

  // Invalid link indices report zero rects.
  EXPECT_EQ(0, FPDFLink_CountRects(links, -1));
  EXPECT_EQ(0, FPDFLink_CountRects(links, 2));
  EXPECT_EQ(0, FPDFLink_CountRects(links, 10000));

  // Valid link index with a valid rect index.
  double left = 0.0;
  double right = 0.0;
  double top = 0.0;
  double bottom = 0.0;
  EXPECT_TRUE(FPDFLink_GetRect(links, 0, 0, &left, &top, &right, &bottom));
  EXPECT_NEAR(50.828, left, 0.001);
  EXPECT_NEAR(187.904, right, 0.001);
  EXPECT_NEAR(97.516, bottom, 0.001);
  EXPECT_NEAR(108.700, top, 0.001);

  // A valid link with an invalid rect index leaves the outputs unchanged.
  left = right = top = bottom = -1.0;
  EXPECT_FALSE(FPDFLink_GetRect(links, 0, 1, &left, &top, &right, &bottom));
  EXPECT_EQ(-1.0, left);
  EXPECT_EQ(-1.0, right);
  EXPECT_EQ(-1.0, bottom);
  EXPECT_EQ(-1.0, top);

  // An invalid link index leaves the outputs unchanged.
  left = right = top = bottom = -2.0;
  EXPECT_FALSE(FPDFLink_GetRect(links, -1, 0, &left, &top, &right, &bottom));
  EXPECT_EQ(-2.0, left);
  EXPECT_EQ(-2.0, right);
  EXPECT_EQ(-2.0, bottom);
  EXPECT_EQ(-2.0, top);

  FPDFLink_CloseWebLinks(links);
  FPDFText_ClosePage(text_page);
  UnloadPage(page);
}
657
// Checks the URLs detected in weblinks_across_lines.pdf, where the source
// text of each link is followed by or split across line breaks.
TEST_F(FPDFTextEmbedderTest, WebLinksAcrossLines) {
  ASSERT_TRUE(OpenDocument("weblinks_across_lines.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE text_page = FPDFText_LoadPage(page);
  ASSERT_TRUE(text_page);

  FPDF_PAGELINK links = FPDFLink_LoadWebLinks(text_page);
  EXPECT_TRUE(links);

  static const char* const kExpectedUrls[] = {
      "http://example.com",           // from "http://www.example.com?\r\nfoo"
      "http://example.com/",          // from "http://www.example.com/\r\nfoo"
      "http://example.com/test-foo",  // from "http://example.com/test-\r\nfoo"
      "http://abc.com/test-foo",      // from "http://abc.com/test-\r\n\r\nfoo"
      // Next two links from "http://www.example.com/\r\nhttp://www.abc.com/"
      "http://example.com/",
      "http://www.abc.com",
  };
  static const int kNumLinks = static_cast<int>(std::size(kExpectedUrls));

  EXPECT_EQ(kNumLinks, FPDFLink_CountWebLinks(links));

  unsigned short url_buffer[128];
  int link_index = 0;
  for (const char* expected_url : kExpectedUrls) {
    // Required length includes the terminating NUL.
    const size_t required_len = strlen(expected_url) + 1;
    memset(url_buffer, 0, sizeof(url_buffer));
    EXPECT_EQ(static_cast<int>(required_len),
              FPDFLink_GetURL(links, link_index, nullptr, 0));
    EXPECT_EQ(
        static_cast<int>(required_len),
        FPDFLink_GetURL(links, link_index, url_buffer, std::size(url_buffer)));
    EXPECT_TRUE(check_unsigned_shorts(expected_url, url_buffer, required_len));
    ++link_index;
  }

  FPDFLink_CloseWebLinks(links);
  FPDFText_ClosePage(text_page);
  UnloadPage(page);
}
697
// Checks the link count of bug_650.pdf and the URL of its second link.
TEST_F(FPDFTextEmbedderTest, WebLinksAcrossLinesBug) {
  ASSERT_TRUE(OpenDocument("bug_650.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE text_page = FPDFText_LoadPage(page);
  ASSERT_TRUE(text_page);

  FPDF_PAGELINK links = FPDFLink_LoadWebLinks(text_page);
  EXPECT_TRUE(links);

  EXPECT_EQ(2, FPDFLink_CountWebLinks(links));

  static const char kSecondLinkUrl[] =
      "http://tutorial45.com/learn-autocad-basics-day-166/";
  // Size includes the terminating NUL.
  static const int kSecondLinkUrlSize =
      static_cast<int>(sizeof(kSecondLinkUrl));
  unsigned short url_buffer[128] = {0};

  // Query the required length first, then fetch into an ample buffer.
  EXPECT_EQ(kSecondLinkUrlSize, FPDFLink_GetURL(links, 1, nullptr, 0));
  EXPECT_EQ(kSecondLinkUrlSize,
            FPDFLink_GetURL(links, 1, url_buffer, std::size(url_buffer)));
  EXPECT_TRUE(
      check_unsigned_shorts(kSecondLinkUrl, url_buffer, kSecondLinkUrlSize));

  FPDFLink_CloseWebLinks(links);
  FPDFText_ClosePage(text_page);
  UnloadPage(page);
}
723
// Checks FPDFLink_GetTextRange() on weblinks.pdf: a valid link reports its
// start index and character count, while an out-of-range index, a null page
// link and a negative index all fail and leave the outputs untouched.
TEST_F(FPDFTextEmbedderTest, WebLinksCharRanges) {
  ASSERT_TRUE(OpenDocument("weblinks.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handles so the page link and text page are released even when
    // one of the ASSERT_* checks below returns early, and so both are closed
    // before |page| is unloaded. |page_link| is declared last and therefore
    // destroyed before |text_page|.
    ScopedFPDFTextPage text_page(FPDFText_LoadPage(page));
    ASSERT_TRUE(text_page);

    ScopedFPDFPageLink page_link(FPDFLink_LoadWebLinks(text_page.get()));
    EXPECT_TRUE(page_link);

    // Test for char indices of a valid link
    int start_char_index;
    int char_count;
    ASSERT_TRUE(FPDFLink_GetTextRange(page_link.get(), 0, &start_char_index,
                                      &char_count));
    EXPECT_EQ(35, start_char_index);
    EXPECT_EQ(24, char_count);

    // Test for char indices of an invalid link
    start_char_index = -10;
    char_count = -8;
    ASSERT_FALSE(FPDFLink_GetTextRange(page_link.get(), 6, &start_char_index,
                                       &char_count));
    EXPECT_EQ(start_char_index, -10);
    EXPECT_EQ(char_count, -8);

    // Test for pagelink = nullptr
    start_char_index = -10;
    char_count = -8;
    ASSERT_FALSE(
        FPDFLink_GetTextRange(nullptr, 0, &start_char_index, &char_count));
    EXPECT_EQ(start_char_index, -10);
    EXPECT_EQ(char_count, -8);

    // Test for link_index < 0
    start_char_index = -10;
    char_count = -8;
    ASSERT_FALSE(FPDFLink_GetTextRange(page_link.get(), -4, &start_char_index,
                                       &char_count));
    EXPECT_EQ(start_char_index, -10);
    EXPECT_EQ(char_count, -8);
  }

  UnloadPage(page);
}
771
// Cross-checks the link annotations of annots.pdf: the number of annotations
// whose subtype is FPDF_ANNOT_LINK must equal the number of links yielded by
// FPDFLink_Enumerate(), and the first four enumerated links are inspected
// individually.
TEST_F(FPDFTextEmbedderTest, AnnotLinks) {
  ASSERT_TRUE(OpenDocument("annots.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  // Count the link annotations by looking at every annotation's subtype.
  const int annot_count = FPDFPage_GetAnnotCount(page);
  ASSERT_EQ(9, annot_count);
  int links_by_subtype = 0;
  for (int annot_index = 0; annot_index < annot_count; ++annot_index) {
    ScopedFPDFAnnotation annot(FPDFPage_GetAnnot(page, annot_index));
    if (FPDFAnnot_GetSubtype(annot.get()) == FPDF_ANNOT_LINK)
      ++links_by_subtype;
  }
  EXPECT_EQ(4, links_by_subtype);

  // FPDFLink_Enumerate() must yield the same number of links. The position
  // it reports after each successful call selects which checks apply.
  int enumerate_pos = 0;
  int links_enumerated = 0;
  FPDF_LINK link;
  while (FPDFLink_Enumerate(page, &enumerate_pos, &link)) {
    ASSERT_TRUE(link);
    switch (enumerate_pos) {
      case 1:
      case 2: {
        // The first two links point to the first and second page of the
        // document respectively.
        FPDF_DEST dest = FPDFLink_GetDest(document(), link);
        EXPECT_TRUE(dest);
        EXPECT_EQ(enumerate_pos - 1,
                  FPDFDest_GetDestPageIndex(document(), dest));
        break;
      }
      case 3: {
        // Points to the PDF Spec URL.
        FS_RECTF annot_rect;
        EXPECT_TRUE(FPDFLink_GetAnnotRect(link, &annot_rect));
        EXPECT_NEAR(66.0, annot_rect.left, 0.001);
        EXPECT_NEAR(544.0, annot_rect.top, 0.001);
        EXPECT_NEAR(196.0, annot_rect.right, 0.001);
        EXPECT_NEAR(529.0, annot_rect.bottom, 0.001);
        break;
      }
      case 4: {
        // This link has quad points.
        const int quad_count = FPDFLink_CountQuadPoints(link);
        EXPECT_EQ(1, quad_count);
        FS_QUADPOINTSF quad;
        EXPECT_TRUE(FPDFLink_GetQuadPoints(link, 0, &quad));
        EXPECT_NEAR(83.0, quad.x1, 0.001);
        EXPECT_NEAR(453.0, quad.y1, 0.001);
        EXPECT_NEAR(178.0, quad.x2, 0.001);
        EXPECT_NEAR(453.0, quad.y2, 0.001);
        EXPECT_NEAR(83.0, quad.x3, 0.001);
        EXPECT_NEAR(440.0, quad.y3, 0.001);
        EXPECT_NEAR(178.0, quad.x4, 0.001);
        EXPECT_NEAR(440.0, quad.y4, 0.001);
        // The annotation rect matches the quad points for this link.
        FS_RECTF annot_rect;
        EXPECT_TRUE(FPDFLink_GetAnnotRect(link, &annot_rect));
        EXPECT_NEAR(annot_rect.left, quad.x1, 0.001);
        EXPECT_NEAR(annot_rect.top, quad.y1, 0.001);
        EXPECT_NEAR(annot_rect.right, quad.x4, 0.001);
        EXPECT_NEAR(annot_rect.bottom, quad.y4, 0.001);
        break;
      }
      default:
        break;
    }
    ++links_enumerated;
  }
  EXPECT_EQ(links_by_subtype, links_enumerated);

  UnloadPage(page);
}
836
// Checks the font size FPDFText_GetFontSize() reports for every character of
// hello_world.pdf against a per-character table.
TEST_F(FPDFTextEmbedderTest, GetFontSize) {
  ASSERT_TRUE(OpenDocument("hello_world.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handle so the text page is released even if the character count
    // assertion below returns early, and so it is closed before |page| is
    // unloaded.
    ScopedFPDFTextPage textpage(FPDFText_LoadPage(page));
    ASSERT_TRUE(textpage);

    // One entry per character on the page.
    const double kExpectedFontsSizes[] = {12, 12, 12, 12, 12, 12, 12, 12,
                                          12, 12, 12, 12, 12, 1,  1,  16,
                                          16, 16, 16, 16, 16, 16, 16, 16,
                                          16, 16, 16, 16, 16, 16};

    int count = FPDFText_CountChars(textpage.get());
    // Must hold before indexing |kExpectedFontsSizes| with [0, count).
    ASSERT_EQ(std::size(kExpectedFontsSizes), static_cast<size_t>(count));
    for (int i = 0; i < count; ++i) {
      EXPECT_EQ(kExpectedFontsSizes[i],
                FPDFText_GetFontSize(textpage.get(), i))
          << i;
    }
  }

  UnloadPage(page);
}
857
// Exercises FPDFText_GetFontInfo() on hello_world.pdf: size query with a null
// buffer, name retrieval, a too-small buffer, characters without font
// information, and invalid arguments.
TEST_F(FPDFTextEmbedderTest, GetFontInfo) {
  ASSERT_TRUE(OpenDocument("hello_world.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handle: the ASSERT_EQ checks inside the loops below return from
    // the test on failure, which previously leaked the text page.
    ScopedFPDFTextPage textpage(FPDFText_LoadPage(page));
    ASSERT_TRUE(textpage);
    std::vector<char> font_name;
    size_t num_chars1 = strlen("Hello, world!");
    const char kExpectedFontName1[] = "Times-Roman";

    for (size_t i = 0; i < num_chars1; i++) {
      int flags = -1;
      // With a null buffer the call only reports the required length; the
      // expected value is sizeof() of the name, i.e. it counts the NUL.
      unsigned long length =
          FPDFText_GetFontInfo(textpage.get(), i, nullptr, 0, &flags);
      static constexpr unsigned long expected_length =
          sizeof(kExpectedFontName1);
      ASSERT_EQ(expected_length, length);
      EXPECT_EQ(FXFONT_NONSYMBOLIC, flags);
      font_name.resize(length);
      // Pre-fill with a sentinel so a successful write is observable.
      std::fill(font_name.begin(), font_name.end(), 'a');
      flags = -1;
      EXPECT_EQ(expected_length,
                FPDFText_GetFontInfo(textpage.get(), i, font_name.data(),
                                     font_name.size(), &flags));
      EXPECT_STREQ(kExpectedFontName1, font_name.data());
      EXPECT_EQ(FXFONT_NONSYMBOLIC, flags);
    }
    // If the size of the buffer is not large enough, the buffer should remain
    // unchanged.
    font_name.pop_back();
    std::fill(font_name.begin(), font_name.end(), 'a');
    EXPECT_EQ(sizeof(kExpectedFontName1),
              FPDFText_GetFontInfo(textpage.get(), 0, font_name.data(),
                                   font_name.size(), nullptr));
    for (char a : font_name)
      EXPECT_EQ('a', a);

    // The text is "Hello, world!\r\nGoodbye, world!", so the next two
    // characters do not have any font information.
    EXPECT_EQ(0u,
              FPDFText_GetFontInfo(textpage.get(), num_chars1, font_name.data(),
                                   font_name.size(), nullptr));
    EXPECT_EQ(0u, FPDFText_GetFontInfo(textpage.get(), num_chars1 + 1,
                                       font_name.data(), font_name.size(),
                                       nullptr));

    size_t num_chars2 = strlen("Goodbye, world!");
    const char kExpectedFontName2[] = "Helvetica";
    for (size_t i = num_chars1 + 2; i < num_chars1 + num_chars2 + 2; i++) {
      int flags = -1;
      unsigned long length =
          FPDFText_GetFontInfo(textpage.get(), i, nullptr, 0, &flags);
      static constexpr unsigned long expected_length =
          sizeof(kExpectedFontName2);
      ASSERT_EQ(expected_length, length);
      EXPECT_EQ(FXFONT_NONSYMBOLIC, flags);
      font_name.resize(length);
      std::fill(font_name.begin(), font_name.end(), 'a');
      flags = -1;
      EXPECT_EQ(expected_length,
                FPDFText_GetFontInfo(textpage.get(), i, font_name.data(),
                                     font_name.size(), &flags));
      EXPECT_STREQ(kExpectedFontName2, font_name.data());
      EXPECT_EQ(FXFONT_NONSYMBOLIC, flags);
    }

    // Now try some out of bounds indices and null pointers to make sure we do
    // not crash.
    // No textpage.
    EXPECT_EQ(0u, FPDFText_GetFontInfo(nullptr, 0, font_name.data(),
                                       font_name.size(), nullptr));
    // No buffer.
    EXPECT_EQ(sizeof(kExpectedFontName1),
              FPDFText_GetFontInfo(textpage.get(), 0, nullptr, 0, nullptr));
    // Negative index.
    EXPECT_EQ(0u, FPDFText_GetFontInfo(textpage.get(), -1, font_name.data(),
                                       font_name.size(), nullptr));
    // Out of bounds index.
    EXPECT_EQ(0u, FPDFText_GetFontInfo(textpage.get(), 1000, font_name.data(),
                                       font_name.size(), nullptr));
  }

  UnloadPage(page);
}
939
// bug_583.pdf is expected to yield exactly one character whose Unicode value
// is reported as 0.
TEST_F(FPDFTextEmbedderTest, ToUnicode) {
  ASSERT_TRUE(OpenDocument("bug_583.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handle: a failing ASSERT_EQ below must not leak the text page.
    ScopedFPDFTextPage textpage(FPDFText_LoadPage(page));
    ASSERT_TRUE(textpage);

    ASSERT_EQ(1, FPDFText_CountChars(textpage.get()));
    EXPECT_EQ(0U, FPDFText_GetUnicode(textpage.get(), 0));
  }

  UnloadPage(page);
}
954
// Checks FPDFText_IsGenerated() for a few characters of hello_world.pdf and
// for invalid arguments, which are expected to yield -1.
TEST_F(FPDFTextEmbedderTest, IsGenerated) {
  ASSERT_TRUE(OpenDocument("hello_world.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    ScopedFPDFTextPage scoped_text(FPDFText_LoadPage(page));
    ASSERT_TRUE(scoped_text);
    FPDF_TEXTPAGE handle = scoped_text.get();

    // Character index, the character expected there, and the expected
    // FPDFText_IsGenerated() result for it.
    struct Probe {
      int index;
      char character;
      int generated;
    };
    static constexpr Probe kProbes[] = {
        {0, 'H', 0},
        {6, ' ', 0},
        {13, '\r', 1},
        {14, '\n', 1},
    };
    for (const Probe& probe : kProbes) {
      EXPECT_EQ(static_cast<unsigned int>(probe.character),
                FPDFText_GetUnicode(handle, probe.index));
      EXPECT_EQ(probe.generated, FPDFText_IsGenerated(handle, probe.index));
    }

    // Invalid index or handle.
    EXPECT_EQ(-1, FPDFText_IsGenerated(handle, -1));
    EXPECT_EQ(-1, FPDFText_IsGenerated(handle, kHelloGoodbyeTextSize));
    EXPECT_EQ(-1, FPDFText_IsGenerated(nullptr, 6));
  }

  UnloadPage(page);
}
985
// Checks FPDFText_HasUnicodeMapError() on bug_1388_2.pdf for the first three
// characters and for invalid arguments, which are expected to yield -1.
TEST_F(FPDFTextEmbedderTest, IsInvalidUnicode) {
  ASSERT_TRUE(OpenDocument("bug_1388_2.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    constexpr int kExpectedCharCount = 5;
    ScopedFPDFTextPage scoped_text(FPDFText_LoadPage(page));
    ASSERT_TRUE(scoped_text);
    FPDF_TEXTPAGE handle = scoped_text.get();
    EXPECT_EQ(kExpectedCharCount, FPDFText_CountChars(handle));

    // Character index, expected Unicode value, and expected map-error flag.
    struct Probe {
      int index;
      unsigned int unicode;
      int map_error;
    };
    static constexpr Probe kProbes[] = {
        {0, static_cast<unsigned int>('X'), 0},
        {1, static_cast<unsigned int>(' '), 0},
        {2, 31u, 1},
    };
    for (const Probe& probe : kProbes) {
      EXPECT_EQ(probe.unicode, FPDFText_GetUnicode(handle, probe.index));
      EXPECT_EQ(probe.map_error,
                FPDFText_HasUnicodeMapError(handle, probe.index));
    }

    // Invalid index or handle.
    EXPECT_EQ(-1, FPDFText_HasUnicodeMapError(handle, -1));
    EXPECT_EQ(-1, FPDFText_HasUnicodeMapError(handle, kExpectedCharCount));
    EXPECT_EQ(-1, FPDFText_HasUnicodeMapError(nullptr, 0));
  }

  UnloadPage(page);
}
1015
// Verifies that a run of characters in bug_921.pdf is reported identically by
// FPDFText_GetUnicode() and FPDFText_GetText().
TEST_F(FPDFTextEmbedderTest, Bug_921) {
  ASSERT_TRUE(OpenDocument("bug_921.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handle: the fatal assertions below must not leak the text page.
    ScopedFPDFTextPage textpage(FPDFText_LoadPage(page));
    ASSERT_TRUE(textpage);

    // Expected code points for the run beginning at |kStartIndex|.
    static constexpr unsigned int kData[] = {
        1095, 1077, 1083, 1086, 1074, 1077, 1095, 1077, 1089, 1082, 1086, 1077,
        32,   1089, 1090, 1088, 1072, 1076, 1072, 1085, 1080, 1077, 46,   32};
    static constexpr int kStartIndex = 238;

    ASSERT_EQ(268, FPDFText_CountChars(textpage.get()));
    for (size_t i = 0; i < std::size(kData); ++i) {
      EXPECT_EQ(kData[i],
                FPDFText_GetUnicode(textpage.get(), kStartIndex + i));
    }

    // One extra slot for the terminating NUL that FPDFText_GetText() appends.
    unsigned short buffer[std::size(kData) + 1];
    memset(buffer, 0xbd, sizeof(buffer));
    int count = FPDFText_GetText(textpage.get(), kStartIndex, std::size(kData),
                                 buffer);
    ASSERT_GT(count, 0);
    ASSERT_EQ(std::size(kData) + 1, static_cast<size_t>(count));
    for (size_t i = 0; i < std::size(kData); ++i)
      EXPECT_EQ(kData[i], buffer[i]);
    EXPECT_EQ(0, buffer[std::size(kData)]);
  }

  UnloadPage(page);
}
1045
// Checks how FPDFText_GetText() reports soft and hard hyphens in
// bug_781804.pdf.
TEST_F(FPDFTextEmbedderTest, GetTextWithHyphen) {
  ASSERT_TRUE(OpenDocument("bug_781804.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
  ASSERT_TRUE(textpage);

  // Check that soft hyphens are not included
  // Expecting 'Veritaserum', except there is a \uFFFE where the hyphen was in
  // the original text. This is a weird thing that Adobe does, which we
  // replicate.
  constexpr unsigned short soft_expected[] = {
      0x0056, 0x0065, 0x0072, 0x0069, 0x0074, 0x0061, 0xfffe,
      0x0073, 0x0065, 0x0072, 0x0075, 0x006D, 0x0000};
  {
    constexpr int count = std::size(soft_expected) - 1;
    unsigned short buffer[std::size(soft_expected)];
    memset(buffer, 0, sizeof(buffer));

    EXPECT_EQ(count + 1, FPDFText_GetText(textpage, 0, count, buffer));
    for (int i = 0; i < count; i++)
      EXPECT_EQ(soft_expected[i], buffer[i]);
  }

  // Check that hard hyphens are included
  {
    // There isn't the \0 in the actual doc, but there is a \r\n, so need to
    // add 1 to get aligned.
    constexpr size_t offset = std::size(soft_expected) + 1;
    // Expecting 'User-\r\ngenerated', the - is a unicode character, so cannot
    // store in a char[].
    constexpr unsigned short hard_expected[] = {
        0x0055, 0x0073, 0x0065, 0x0072, 0x2010, 0x000d, 0x000a, 0x0067, 0x0065,
        0x006e, 0x0065, 0x0072, 0x0061, 0x0074, 0x0065, 0x0064, 0x0000};
    constexpr int count = std::size(hard_expected) - 1;
    // Zero-initialized like the soft-hyphen case: the EXPECT_EQ on the return
    // value is non-fatal, so the comparison loop still runs after a short or
    // failed read and must not inspect indeterminate memory.
    unsigned short buffer[std::size(hard_expected)] = {};

    EXPECT_EQ(count + 1, FPDFText_GetText(textpage, offset, count, buffer));
    for (int i = 0; i < count; i++)
      EXPECT_EQ(hard_expected[i], buffer[i]);
  }

  FPDFText_ClosePage(textpage);
  UnloadPage(page);
}
1092
// Loads and immediately releases the text page of bug_782596.pdf.
TEST_F(FPDFTextEmbedderTest, bug_782596) {
  // If there is a regression in this test, it will only fail under ASAN
  ASSERT_TRUE(OpenDocument("bug_782596.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);
  {
    // The handle is released at the end of this scope, before the page is
    // unloaded.
    ScopedFPDFTextPage scoped_text(FPDFText_LoadPage(page));
    ASSERT_TRUE(scoped_text);
  }
  UnloadPage(page);
}
1103
// Verifies that control characters present in control_characters.pdf are not
// part of the text returned by FPDFText_GetText().
TEST_F(FPDFTextEmbedderTest, ControlCharacters) {
  ASSERT_TRUE(OpenDocument("control_characters.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handle: the fatal assertions below must not leak the text page.
    ScopedFPDFTextPage textpage(FPDFText_LoadPage(page));
    ASSERT_TRUE(textpage);

    // Should not include the control characters in the output
    unsigned short buffer[128];
    memset(buffer, 0xbd, sizeof(buffer));
    int num_chars = FPDFText_GetText(textpage.get(), 0, 128, buffer);
    ASSERT_EQ(kHelloGoodbyeTextSize, num_chars);
    EXPECT_TRUE(check_unsigned_shorts(kHelloGoodbyeText, buffer,
                                      kHelloGoodbyeTextSize));

    // Attempting to get a chunk of text after the control characters
    static const char expected_substring[] = "Goodbye, world!";
    // Offset is the length of 'Hello, world!\r\n' + 2 control characters in
    // the original stream
    static const int offset = 17;
    memset(buffer, 0xbd, sizeof(buffer));
    num_chars = FPDFText_GetText(textpage.get(), offset, 128, buffer);

    // Guard the signed-to-unsigned cast in the comparison below.
    ASSERT_GE(num_chars, 0);
    EXPECT_EQ(sizeof(expected_substring), static_cast<size_t>(num_chars));
    EXPECT_TRUE(check_unsigned_shorts(expected_substring, buffer,
                                      sizeof(expected_substring)));
  }

  UnloadPage(page);
}
1136
1137 // Testing that hyphen makers (0x0002) are replacing hard hyphens when
1138 // the word contains non-ASCII characters.
TEST_F(FPDFTextEmbedderTest,bug_1029)1139 TEST_F(FPDFTextEmbedderTest, bug_1029) {
1140 ASSERT_TRUE(OpenDocument("bug_1029.pdf"));
1141 FPDF_PAGE page = LoadPage(0);
1142 ASSERT_TRUE(page);
1143
1144 FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
1145 ASSERT_TRUE(textpage);
1146
1147 constexpr int page_range_offset = 171;
1148 constexpr int page_range_length = 56;
1149
1150 // This text is:
1151 // 'METADATA table. When the split has committed, it noti' followed
1152 // by a 'soft hyphen' (0x0002) and then 'fi'.
1153 //
1154 // The original text has a fi ligature, but that is broken up into
1155 // two characters when the PDF is processed.
1156 constexpr unsigned int expected[] = {
1157 0x004d, 0x0045, 0x0054, 0x0041, 0x0044, 0x0041, 0x0054, 0x0041,
1158 0x0020, 0x0074, 0x0061, 0x0062, 0x006c, 0x0065, 0x002e, 0x0020,
1159 0x0057, 0x0068, 0x0065, 0x006e, 0x0020, 0x0074, 0x0068, 0x0065,
1160 0x0020, 0x0073, 0x0070, 0x006c, 0x0069, 0x0074, 0x0020, 0x0068,
1161 0x0061, 0x0073, 0x0020, 0x0063, 0x006f, 0x006d, 0x006d, 0x0069,
1162 0x0074, 0x0074, 0x0065, 0x0064, 0x002c, 0x0020, 0x0069, 0x0074,
1163 0x0020, 0x006e, 0x006f, 0x0074, 0x0069, 0x0002, 0x0066, 0x0069};
1164 static_assert(page_range_length == std::size(expected),
1165 "Expected should be the same size as the range being "
1166 "extracted from page.");
1167 EXPECT_LT(page_range_offset + page_range_length,
1168 FPDFText_CountChars(textpage));
1169
1170 for (int i = 0; i < page_range_length; ++i) {
1171 EXPECT_EQ(expected[i],
1172 FPDFText_GetUnicode(textpage, page_range_offset + i));
1173 }
1174
1175 FPDFText_ClosePage(textpage);
1176 UnloadPage(page);
1177 }
1178
// Exercises FPDFText_CountRects() on hello_world.pdf with a variety of start
// indices and character counts, including negative and oversized counts.
TEST_F(FPDFTextEmbedderTest, CountRects) {
  ASSERT_TRUE(OpenDocument("hello_world.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handle: the sanity-check ASSERT_EQ below must not leak the text
    // page when it fails.
    ScopedFPDFTextPage textpage(FPDFText_LoadPage(page));
    ASSERT_TRUE(textpage);

    // Sanity check hello_world.pdf.
    // |num_chars| check includes the terminating NUL that is provided.
    {
      unsigned short buffer[128];
      int num_chars = FPDFText_GetText(textpage.get(), 0, 128, buffer);
      ASSERT_EQ(kHelloGoodbyeTextSize, num_chars);
      EXPECT_TRUE(check_unsigned_shorts(kHelloGoodbyeText, buffer,
                                        kHelloGoodbyeTextSize));
    }

    // Now test FPDFText_CountRects().
    static const int kHelloWorldEnd = strlen("Hello, world!");
    static const int kGoodbyeWorldStart = kHelloWorldEnd + 2;  // "\r\n"
    for (int start = 0; start < kHelloWorldEnd; ++start) {
      // Always grab some part of "hello world" and some part of "goodbye
      // world" Since -1 means "all".
      EXPECT_EQ(2, FPDFText_CountRects(textpage.get(), start, -1));

      // No characters always means 0 rects.
      EXPECT_EQ(0, FPDFText_CountRects(textpage.get(), start, 0));

      // 1 character stays within "hello world"
      EXPECT_EQ(1, FPDFText_CountRects(textpage.get(), start, 1));

      // When |start| is 0, Having |kGoodbyeWorldStart| char count does not
      // reach "goodbye world".
      int expected_value = start ? 2 : 1;
      EXPECT_EQ(expected_value, FPDFText_CountRects(textpage.get(), start,
                                                    kGoodbyeWorldStart));

      // Extremely large character count will always return 2 rects because
      // |start| starts inside "hello world".
      EXPECT_EQ(2, FPDFText_CountRects(textpage.get(), start, 500));
    }

    // Now test negative counts.
    for (int start = 0; start < kHelloWorldEnd; ++start) {
      EXPECT_EQ(2, FPDFText_CountRects(textpage.get(), start, -100));
      EXPECT_EQ(2, FPDFText_CountRects(textpage.get(), start, -2));
    }

    // Now test larger start values.
    const int kExpectedLength = strlen(kHelloGoodbyeText);
    for (int start = kGoodbyeWorldStart + 1; start < kExpectedLength;
         ++start) {
      EXPECT_EQ(1, FPDFText_CountRects(textpage.get(), start, -1));
      EXPECT_EQ(0, FPDFText_CountRects(textpage.get(), start, 0));
      EXPECT_EQ(1, FPDFText_CountRects(textpage.get(), start, 1));
      EXPECT_EQ(1, FPDFText_CountRects(textpage.get(), start, 2));
      EXPECT_EQ(1, FPDFText_CountRects(textpage.get(), start, 500));
    }

    // Now test start values that starts beyond the end of the text.
    for (int start = kExpectedLength; start < 100; ++start) {
      EXPECT_EQ(0, FPDFText_CountRects(textpage.get(), start, -1));
      EXPECT_EQ(0, FPDFText_CountRects(textpage.get(), start, 0));
      EXPECT_EQ(0, FPDFText_CountRects(textpage.get(), start, 1));
      EXPECT_EQ(0, FPDFText_CountRects(textpage.get(), start, 2));
      EXPECT_EQ(0, FPDFText_CountRects(textpage.get(), start, 500));
    }
  }

  UnloadPage(page);
}
1250
// Exercises FPDFTextObj_GetText() on the first page object of
// hello_world.pdf: size query, retrieval, null arguments and a buffer that is
// too small.
TEST_F(FPDFTextEmbedderTest, GetText) {
  ASSERT_TRUE(OpenDocument("hello_world.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handle: every check below is a fatal ASSERT_*, and any failure
    // previously leaked the text page.
    ScopedFPDFTextPage text_page(FPDFText_LoadPage(page));
    ASSERT_TRUE(text_page);

    EXPECT_EQ(2, FPDFPage_CountObjects(page));
    FPDF_PAGEOBJECT text_object = FPDFPage_GetObject(page, 0);
    ASSERT_TRUE(text_object);

    // Positive testing.
    constexpr char kHelloText[] = "Hello, world!";
    // Return value includes the terminating NUL that is provided.
    constexpr unsigned long kHelloUTF16Size = std::size(kHelloText) * 2;
    constexpr wchar_t kHelloWideText[] = L"Hello, world!";
    unsigned long size =
        FPDFTextObj_GetText(text_object, text_page.get(), nullptr, 0);
    ASSERT_EQ(kHelloUTF16Size, size);

    std::vector<unsigned short> buffer(size);
    ASSERT_EQ(size, FPDFTextObj_GetText(text_object, text_page.get(),
                                        buffer.data(), size));
    ASSERT_EQ(kHelloWideText, GetPlatformWString(buffer.data()));

    // Negative testing.
    ASSERT_EQ(0U, FPDFTextObj_GetText(nullptr, text_page.get(), nullptr, 0));
    ASSERT_EQ(0U, FPDFTextObj_GetText(text_object, nullptr, nullptr, 0));
    ASSERT_EQ(0U, FPDFTextObj_GetText(nullptr, nullptr, nullptr, 0));

    // Buffer is too small, ensure it's not modified.
    buffer.resize(2);
    buffer[0] = 'x';
    buffer[1] = '\0';
    size = FPDFTextObj_GetText(text_object, text_page.get(), buffer.data(),
                               buffer.size());
    ASSERT_EQ(kHelloUTF16Size, size);
    ASSERT_EQ('x', buffer[0]);
    ASSERT_EQ('\0', buffer[1]);
  }

  UnloadPage(page);
}
1294
// For each page of cropped_text.pdf, checks the page bounding box, that
// FPDFText_GetText() still returns the full text, and that
// FPDFText_GetBoundedText() restricted to the bounding box returns the
// expected cropped text.
TEST_F(FPDFTextEmbedderTest, CroppedText) {
  static constexpr int kPageCount = 4;
  // Per-page expectations: the bounding box and the text found inside it.
  struct PageExpectation {
    FS_RECTF box;
    const char* bounded_text;
  };
  static constexpr PageExpectation kPages[kPageCount] = {
      {{50.0f, 150.0f, 150.0f, 50.0f}, " world!\r\ndbye, world!"},
      {{50.0f, 150.0f, 150.0f, 50.0f}, " world!\r\ndbye, world!"},
      {{60.0f, 150.0f, 150.0f, 60.0f}, "bye, world!"},
      {{60.0f, 150.0f, 150.0f, 60.0f}, "bye, world!"},
  };

  ASSERT_TRUE(OpenDocument("cropped_text.pdf"));
  ASSERT_EQ(kPageCount, FPDF_GetPageCount(document()));

  for (int page_index = 0; page_index < kPageCount; ++page_index) {
    const PageExpectation& expectation = kPages[page_index];
    FPDF_PAGE page = LoadPage(page_index);
    ASSERT_TRUE(page);

    FS_RECTF actual_box;
    EXPECT_TRUE(FPDF_GetPageBoundingBox(page, &actual_box));
    EXPECT_EQ(expectation.box.left, actual_box.left);
    EXPECT_EQ(expectation.box.top, actual_box.top);
    EXPECT_EQ(expectation.box.right, actual_box.right);
    EXPECT_EQ(expectation.box.bottom, actual_box.bottom);

    {
      ScopedFPDFTextPage scoped_text(FPDFText_LoadPage(page));
      ASSERT_TRUE(scoped_text);
      FPDF_TEXTPAGE handle = scoped_text.get();

      // The unbounded extraction is expected to return the whole text.
      unsigned short text[128];
      memset(text, 0xbd, sizeof(text));
      int full_length = FPDFText_GetText(handle, 0, 128, text);
      ASSERT_EQ(kHelloGoodbyeTextSize, full_length);
      EXPECT_TRUE(check_unsigned_shorts(kHelloGoodbyeText, text,
                                        kHelloGoodbyeTextSize));

      // Without a buffer, the expected return value is the character count
      // without a terminator.
      int bounded_length = strlen(expectation.bounded_text);
      ASSERT_EQ(bounded_length,
                FPDFText_GetBoundedText(handle, actual_box.left,
                                        actual_box.top, actual_box.right,
                                        actual_box.bottom, nullptr, 0));

      // With a buffer, the expected return value is one larger.
      memset(text, 0xbd, sizeof(text));
      ASSERT_EQ(bounded_length + 1,
                FPDFText_GetBoundedText(handle, actual_box.left,
                                        actual_box.top, actual_box.right,
                                        actual_box.bottom, text, 128));
      EXPECT_TRUE(check_unsigned_shorts(expectation.bounded_text, text,
                                        bounded_length));
    }

    UnloadPage(page);
  }
}
1351
// Verifies that a leading control character in bug_1139.pdf is counted by
// FPDFText_CountChars() but does not show up in, or block, text extraction.
TEST_F(FPDFTextEmbedderTest, Bug_1139) {
  ASSERT_TRUE(OpenDocument("bug_1139.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handle: the ASSERT_EQ below must not leak the text page when it
    // fails.
    ScopedFPDFTextPage text_page(FPDFText_LoadPage(page));
    ASSERT_TRUE(text_page);

    // -1 for CountChars not including the \0, but +1 for the extra control
    // character.
    EXPECT_EQ(kHelloGoodbyeTextSize, FPDFText_CountChars(text_page.get()));

    // There is an extra control character at the beginning of the string, but
    // it should not appear in the output nor prevent extracting the text.
    unsigned short buffer[128];
    int num_chars = FPDFText_GetText(text_page.get(), 0, 128, buffer);
    ASSERT_EQ(kHelloGoodbyeTextSize, num_chars);
    EXPECT_TRUE(check_unsigned_shorts(kHelloGoodbyeText, buffer,
                                      kHelloGoodbyeTextSize));
  }

  UnloadPage(page);
}
1374
// Verifies that bug_642.pdf yields exactly the text "ABCD".
TEST_F(FPDFTextEmbedderTest, Bug_642) {
  ASSERT_TRUE(OpenDocument("bug_642.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);
  {
    ScopedFPDFTextPage scoped_text(FPDFText_LoadPage(page));
    ASSERT_TRUE(scoped_text);
    FPDF_TEXTPAGE handle = scoped_text.get();

    constexpr char kText[] = "ABCD";
    // Includes the terminating NUL.
    constexpr size_t kTextSize = std::size(kText);
    constexpr int kTextSizeAsInt = static_cast<int>(kTextSize);

    // -1 for CountChars not including the \0
    EXPECT_EQ(kTextSizeAsInt - 1, FPDFText_CountChars(handle));

    // Request every character; the remaining slot holds the terminator.
    unsigned short extracted[kTextSize];
    int extracted_count = FPDFText_GetText(handle, 0, kTextSize - 1, extracted);
    ASSERT_EQ(kTextSizeAsInt, extracted_count);
    EXPECT_TRUE(check_unsigned_shorts(kText, extracted, kTextSize));
  }

  UnloadPage(page);
}
1398
// Checks FPDFText_GetCharAngle() on rotated_text.pdf for invalid arguments and
// for one character in each quadrant.
TEST_F(FPDFTextEmbedderTest, GetCharAngle) {
  ASSERT_TRUE(OpenDocument("rotated_text.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    ScopedFPDFTextPage scoped_text(FPDFText_LoadPage(page));
    ASSERT_TRUE(scoped_text);
    FPDF_TEXTPAGE handle = scoped_text.get();

    // Distance, in characters, from the previously probed character to the
    // next one. std::size() of a string literal counts its NUL.
    static constexpr int kIndexSteps[] = {0, std::size("Hello,"),
                                          std::size(" world!\r\n"),
                                          std::size("Goodbye,")};
    // Expected angle at each probed character, one per quadrant.
    const double kExpectedAngles[] = {FXSYS_PI / 4.0, 3 * FXSYS_PI / 4.0,
                                      5 * FXSYS_PI / 4.0, 7 * FXSYS_PI / 4.0};
    static_assert(std::size(kIndexSteps) == std::size(kExpectedAngles),
                  "Each probed index needs an expected angle");

    // -1 for CountChars not including the \0, but +1 for the extra control
    // character.
    EXPECT_EQ(kHelloGoodbyeTextSize, FPDFText_CountChars(handle));

    // Invalid handle or index.
    EXPECT_FLOAT_EQ(-1.0f, FPDFText_GetCharAngle(nullptr, 0));
    EXPECT_FLOAT_EQ(-1.0f, FPDFText_GetCharAngle(handle, -1));
    EXPECT_FLOAT_EQ(-1.0f,
                    FPDFText_GetCharAngle(handle, kHelloGoodbyeTextSize + 1));

    // Test GetCharAngle for every quadrant
    int char_index = 0;
    for (size_t i = 0; i < std::size(kIndexSteps); ++i) {
      char_index += kIndexSteps[i];
      EXPECT_NEAR(kExpectedAngles[i],
                  FPDFText_GetCharAngle(handle, char_index), 0.001);
    }
  }

  UnloadPage(page);
}
1436
// Checks FPDFText_GetFontWeight() on font_weight.pdf for invalid arguments and
// for both characters on the page.
TEST_F(FPDFTextEmbedderTest, GetFontWeight) {
  ASSERT_TRUE(OpenDocument("font_weight.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    ScopedFPDFTextPage scoped_text(FPDFText_LoadPage(page));
    ASSERT_TRUE(scoped_text);
    FPDF_TEXTPAGE handle = scoped_text.get();

    EXPECT_EQ(2, FPDFText_CountChars(handle));

    // Invalid handle or index.
    EXPECT_EQ(-1, FPDFText_GetFontWeight(nullptr, 0));
    EXPECT_EQ(-1, FPDFText_GetFontWeight(handle, -1));
    EXPECT_EQ(-1, FPDFText_GetFontWeight(handle, 314));

    // The font used for this text only specifies /StemV (80); the weight value
    // that is returned should be calculated from that (80*5 == 400).
    constexpr int kWeightFromStemV80 = 400;
    EXPECT_EQ(kWeightFromStemV80, FPDFText_GetFontWeight(handle, 0));

    // Using a /StemV value of 82, the estimate comes out to 410, even though
    // /FontWeight is 400.
    // TODO(crbug.com/pdfium/1420): Fix the return value here.
    constexpr int kWeightFromStemV82 = 410;
    EXPECT_EQ(kWeightFromStemV82, FPDFText_GetFontWeight(handle, 1));
  }

  UnloadPage(page);
}
1463
// Checks FPDFText_GetTextRenderMode() on text_render_mode.pdf for invalid
// arguments and for one filled and one stroked character.
TEST_F(FPDFTextEmbedderTest, GetTextRenderMode) {
  ASSERT_TRUE(OpenDocument("text_render_mode.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handle: every check below is a fatal ASSERT_EQ, and any failure
    // previously leaked the text page.
    ScopedFPDFTextPage text_page(FPDFText_LoadPage(page));
    ASSERT_TRUE(text_page);

    ASSERT_EQ(12, FPDFText_CountChars(text_page.get()));

    // Invalid handle or index.
    ASSERT_EQ(FPDF_TEXTRENDERMODE_UNKNOWN,
              FPDFText_GetTextRenderMode(nullptr, 0));
    ASSERT_EQ(FPDF_TEXTRENDERMODE_UNKNOWN,
              FPDFText_GetTextRenderMode(text_page.get(), -1));
    ASSERT_EQ(FPDF_TEXTRENDERMODE_UNKNOWN,
              FPDFText_GetTextRenderMode(text_page.get(), 314));

    ASSERT_EQ(FPDF_TEXTRENDERMODE_FILL,
              FPDFText_GetTextRenderMode(text_page.get(), 0));

    ASSERT_EQ(FPDF_TEXTRENDERMODE_STROKE,
              FPDFText_GetTextRenderMode(text_page.get(), 7));
  }

  UnloadPage(page);
}
1489
// Checks FPDFText_GetFillColor() on text_color.pdf for invalid arguments and
// for the single character on the page.
TEST_F(FPDFTextEmbedderTest, GetFillColor) {
  ASSERT_TRUE(OpenDocument("text_color.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handle: every check below is a fatal ASSERT_*, and any failure
    // previously leaked the text page.
    ScopedFPDFTextPage text_page(FPDFText_LoadPage(page));
    ASSERT_TRUE(text_page);

    ASSERT_EQ(1, FPDFText_CountChars(text_page.get()));

    // Invalid handle, invalid indices, and null output pointers.
    ASSERT_FALSE(
        FPDFText_GetFillColor(nullptr, 0, nullptr, nullptr, nullptr, nullptr));
    ASSERT_FALSE(FPDFText_GetFillColor(text_page.get(), -1, nullptr, nullptr,
                                       nullptr, nullptr));
    ASSERT_FALSE(FPDFText_GetFillColor(text_page.get(), 314, nullptr, nullptr,
                                       nullptr, nullptr));
    ASSERT_FALSE(FPDFText_GetFillColor(text_page.get(), 0, nullptr, nullptr,
                                       nullptr, nullptr));

    // Only read after the ASSERT_TRUE below confirms the call succeeded.
    unsigned int r;
    unsigned int g;
    unsigned int b;
    unsigned int a;
    ASSERT_TRUE(FPDFText_GetFillColor(text_page.get(), 0, &r, &g, &b, &a));
    ASSERT_EQ(0xffu, r);
    ASSERT_EQ(0u, g);
    ASSERT_EQ(0u, b);
    ASSERT_EQ(0xffu, a);
  }

  UnloadPage(page);
}
1522
// Checks FPDFText_GetStrokeColor() on text_color.pdf for invalid arguments and
// for the single character on the page.
TEST_F(FPDFTextEmbedderTest, GetStrokeColor) {
  ASSERT_TRUE(OpenDocument("text_color.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handle: every check below is a fatal ASSERT_*, and any failure
    // previously leaked the text page.
    ScopedFPDFTextPage text_page(FPDFText_LoadPage(page));
    ASSERT_TRUE(text_page);

    ASSERT_EQ(1, FPDFText_CountChars(text_page.get()));

    // Invalid handle, invalid indices, and null output pointers.
    ASSERT_FALSE(FPDFText_GetStrokeColor(nullptr, 0, nullptr, nullptr, nullptr,
                                         nullptr));
    ASSERT_FALSE(FPDFText_GetStrokeColor(text_page.get(), -1, nullptr, nullptr,
                                         nullptr, nullptr));
    ASSERT_FALSE(FPDFText_GetStrokeColor(text_page.get(), 314, nullptr,
                                         nullptr, nullptr, nullptr));
    ASSERT_FALSE(FPDFText_GetStrokeColor(text_page.get(), 0, nullptr, nullptr,
                                         nullptr, nullptr));

    // Only read after the ASSERT_TRUE below confirms the call succeeded.
    unsigned int r;
    unsigned int g;
    unsigned int b;
    unsigned int a;
    ASSERT_TRUE(FPDFText_GetStrokeColor(text_page.get(), 0, &r, &g, &b, &a));
    ASSERT_EQ(0u, r);
    ASSERT_EQ(0xffu, g);
    ASSERT_EQ(0u, b);
    ASSERT_EQ(0xffu, a);
  }

  UnloadPage(page);
}
1555
// Checks FPDFText_GetMatrix() for every character of font_matrix.pdf and for
// invalid arguments.
TEST_F(FPDFTextEmbedderTest, GetMatrix) {
  constexpr char kExpectedText[] = "A1\r\nA2\r\nA3";
  // Includes the terminating NUL.
  constexpr size_t kExpectedTextSize = std::size(kExpectedText);
  // One matrix per character of |kExpectedText|.
  constexpr FS_MATRIX kExpectedMatrices[] = {
      {12.0f, 0.0f, 0.0f, 10.0f, 66.0f, 90.0f},
      {12.0f, 0.0f, 0.0f, 10.0f, 66.0f, 90.0f},
      {1.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f},
      {1.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f},
      {12.0f, 0.0f, 0.0f, 10.0f, 38.0f, 60.0f},
      {12.0f, 0.0f, 0.0f, 10.0f, 38.0f, 60.0f},
      {1.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f},
      {1.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f},
      {1.0f, 0.0f, 0.0f, 0.833333, 60.0f, 130.0f},
      {1.0f, 0.0f, 0.0f, 0.833333, 60.0f, 130.0f},
  };
  constexpr size_t kExpectedCount = std::size(kExpectedMatrices);
  static_assert(kExpectedCount + 1 == kExpectedTextSize,
                "Bad expected matrix size");
  constexpr int kCharCount = static_cast<int>(kExpectedCount);

  ASSERT_TRUE(OpenDocument("font_matrix.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    ScopedFPDFTextPage scoped_text(FPDFText_LoadPage(page));
    ASSERT_TRUE(scoped_text);
    FPDF_TEXTPAGE handle = scoped_text.get();
    ASSERT_EQ(kCharCount, FPDFText_CountChars(handle));

    {
      // Check the characters.
      unsigned short extracted[kExpectedTextSize];
      ASSERT_EQ(static_cast<int>(kExpectedTextSize),
                FPDFText_GetText(handle, 0, kExpectedCount, extracted));
      EXPECT_TRUE(
          check_unsigned_shorts(kExpectedText, extracted, kExpectedTextSize));
    }

    // Check the character matrix.
    FS_MATRIX actual;
    int char_index = 0;
    for (const FS_MATRIX& wanted : kExpectedMatrices) {
      ASSERT_TRUE(FPDFText_GetMatrix(handle, char_index, &actual))
          << char_index;
      EXPECT_FLOAT_EQ(wanted.a, actual.a) << char_index;
      EXPECT_FLOAT_EQ(wanted.b, actual.b) << char_index;
      EXPECT_FLOAT_EQ(wanted.c, actual.c) << char_index;
      EXPECT_FLOAT_EQ(wanted.d, actual.d) << char_index;
      EXPECT_FLOAT_EQ(wanted.e, actual.e) << char_index;
      EXPECT_FLOAT_EQ(wanted.f, actual.f) << char_index;
      ++char_index;
    }

    // Check bad parameters: null handle, index equal to the character count,
    // negative index, and null output pointer.
    EXPECT_FALSE(FPDFText_GetMatrix(nullptr, 0, &actual));
    EXPECT_FALSE(FPDFText_GetMatrix(handle, kCharCount, &actual));
    EXPECT_FALSE(FPDFText_GetMatrix(handle, -1, &actual));
    EXPECT_FALSE(FPDFText_GetMatrix(handle, 0, nullptr));
  }

  UnloadPage(page);
}
1615
// Checks the sizes returned by FPDFText_GetCharBox() and
// FPDFText_GetLooseCharBox() for the characters at indices 0, 4 and 8 of
// font_matrix.pdf.
TEST_F(FPDFTextEmbedderTest, CharBox) {
  // For a size 12 letter 'A'.
  constexpr double kExpectedCharWidth = 8.460;
  constexpr double kExpectedCharHeight = 6.600;
  constexpr float kExpectedLooseCharWidth = 8.664f;
  constexpr float kExpectedLooseCharHeight = 12.0f;

  ASSERT_TRUE(OpenDocument("font_matrix.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    ScopedFPDFTextPage scoped_text(FPDFText_LoadPage(page));
    ASSERT_TRUE(scoped_text);
    FPDF_TEXTPAGE handle = scoped_text.get();

    // Check the character box size.
    double left;
    double right;
    double bottom;
    double top;
    for (int char_index : {0, 4, 8}) {
      ASSERT_TRUE(FPDFText_GetCharBox(handle, char_index, &left, &right,
                                      &bottom, &top));
      EXPECT_NEAR(kExpectedCharWidth, right - left, 0.001);
      EXPECT_NEAR(kExpectedCharHeight, top - bottom, 0.001);
    }

    // Check the loose character box size.
    FS_RECTF loose_box;
    for (int char_index : {0, 4}) {
      ASSERT_TRUE(FPDFText_GetLooseCharBox(handle, char_index, &loose_box));
      EXPECT_FLOAT_EQ(kExpectedLooseCharWidth,
                      loose_box.right - loose_box.left);
      EXPECT_FLOAT_EQ(kExpectedLooseCharHeight,
                      loose_box.top - loose_box.bottom);
    }
    // The height at index 8 is compared with an explicit tolerance instead of
    // EXPECT_FLOAT_EQ.
    ASSERT_TRUE(FPDFText_GetLooseCharBox(handle, 8, &loose_box));
    EXPECT_FLOAT_EQ(kExpectedLooseCharWidth, loose_box.right - loose_box.left);
    EXPECT_NEAR(kExpectedLooseCharHeight, loose_box.top - loose_box.bottom,
                0.00001);
  }

  UnloadPage(page);
}
1664
// Checks the Unicode values and character boxes reported for the five
// characters of bug_1591.pdf.
TEST_F(FPDFTextEmbedderTest, SmallType3Glyph) {
  constexpr int kCharCount = 5;
  // Expected Unicode value per character index.
  static constexpr unsigned int kExpectedUnicode[] = {49u, 32u, 50u, 32u, 49u};
  // Expected FPDFText_GetCharBox() output per character index.
  struct BoxExpectation {
    double left;
    double right;
    double bottom;
    double top;
  };
  static constexpr BoxExpectation kExpectedBoxes[] = {
      {63.439998626708984, 65.360000610351562, 50.0, 61.520000457763672},
      {62.007999420166016, 62.007999420166016, 50.0, 50.0},
      {86.0, 88.400001525878906, 50.0, 50.240001678466797},
      {86.010002136230469, 86.010002136230469, 50.0, 50.0},
      {99.44000244140625, 101.36000061035156, 50.0, 61.520000457763672},
  };
  static_assert(std::size(kExpectedUnicode) == kCharCount,
                "One Unicode value per character");
  static_assert(std::size(kExpectedBoxes) == kCharCount,
                "One box per character");

  ASSERT_TRUE(OpenDocument("bug_1591.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    ScopedFPDFTextPage scoped_text(FPDFText_LoadPage(page));
    ASSERT_TRUE(scoped_text);
    FPDF_TEXTPAGE handle = scoped_text.get();
    ASSERT_EQ(kCharCount, FPDFText_CountChars(handle));

    for (int i = 0; i < kCharCount; ++i)
      EXPECT_EQ(kExpectedUnicode[i], FPDFText_GetUnicode(handle, i));

    // Check the character box size.
    double left;
    double right;
    double bottom;
    double top;
    for (int i = 0; i < kCharCount; ++i) {
      ASSERT_TRUE(
          FPDFText_GetCharBox(handle, i, &left, &right, &bottom, &top));
      EXPECT_DOUBLE_EQ(kExpectedBoxes[i].left, left);
      EXPECT_DOUBLE_EQ(kExpectedBoxes[i].right, right);
      EXPECT_DOUBLE_EQ(kExpectedBoxes[i].bottom, bottom);
      EXPECT_DOUBLE_EQ(kExpectedBoxes[i].top, top);
    }
  }

  UnloadPage(page);
}
1720
// Checks that the characters extracted from page 0 of bigtable_mini.pdf match
// the expected string, one code unit at a time.
TEST_F(FPDFTextEmbedderTest, BigtableTextExtraction) {
  constexpr char kExpectedText[] =
      "{fay,jeff,sanjay,wilsonh,kerr,m3b,tushar,\x02k es,gruber}@google.com";
  // Number of characters in `kExpectedText`, excluding the trailing NUL.
  constexpr int kExpectedCharCount = std::size(kExpectedText) - 1;

  ASSERT_TRUE(OpenDocument("bigtable_mini.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    ScopedFPDFTextPage text_page(FPDFText_LoadPage(page));
    ASSERT_TRUE(text_page);
    FPDF_TEXTPAGE text_page_handle = text_page.get();

    // The count must match exactly before the per-character comparison runs,
    // so the loop below never asks for an index past the extracted text.
    const int actual_char_count = FPDFText_CountChars(text_page_handle);
    ASSERT_GE(actual_char_count, 0);
    ASSERT_EQ(kExpectedCharCount, actual_char_count);

    for (int index = 0; index < kExpectedCharCount; ++index) {
      const uint32_t expected_unicode =
          static_cast<uint32_t>(kExpectedText[index]);
      EXPECT_EQ(expected_unicode, FPDFText_GetUnicode(text_page_handle, index));
    }
  }

  UnloadPage(page);
}
1745
// Pins the text currently extracted from page 0 of bug_1769.pdf.
TEST_F(FPDFTextEmbedderTest, Bug1769) {
  // Current extraction result; the size includes the trailing NUL.
  constexpr char kExpectedText[] = "wo d wo d";
  constexpr int kExpectedTextSize = std::size(kExpectedText);
  constexpr int kBufferLength = 128;

  ASSERT_TRUE(OpenDocument("bug_1769.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    ScopedFPDFTextPage text_page(FPDFText_LoadPage(page));
    ASSERT_TRUE(text_page);

    unsigned short extracted[kBufferLength] = {};
    // TODO(crbug.com/pdfium/1769): Improve text extraction.
    // The first instance of "world" is visible to the human eye and should be
    // extracted as is. The second instance is not, so how it should be
    // extracted is debatable.
    ASSERT_EQ(kExpectedTextSize,
              FPDFText_GetText(text_page.get(), 0, kBufferLength, extracted));
    EXPECT_TRUE(
        check_unsigned_shorts(kExpectedText, extracted, kExpectedTextSize));
  }

  UnloadPage(page);
}
1766