xref: /aosp_15_r20/external/pdfium/fpdfsdk/fpdf_text_embeddertest.cpp (revision 3ac0a46f773bac49fa9476ec2b1cf3f8da5ec3a4)
1 // Copyright 2015 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include <algorithm>
6 #include <utility>
7 #include <vector>
8 
9 #include "build/build_config.h"
10 #include "core/fxge/fx_font.h"
11 #include "public/cpp/fpdf_scopers.h"
12 #include "public/fpdf_doc.h"
13 #include "public/fpdf_text.h"
14 #include "public/fpdf_transformpage.h"
15 #include "public/fpdfview.h"
16 #include "testing/embedder_test.h"
17 #include "testing/fx_string_testhelpers.h"
18 #include "testing/gtest/include/gtest/gtest.h"
19 
20 namespace {
21 
// Text expected to be extracted from hello_world.pdf.
constexpr char kHelloGoodbyeText[] = "Hello, world!\r\nGoodbye, world!";
// Element count of |kHelloGoodbyeText|, including the terminating NUL.
constexpr int kHelloGoodbyeTextSize =
    static_cast<int>(std::size(kHelloGoodbyeText));
24 
// Returns true if the first |length| elements of |actual| equal the first
// |length| bytes of |expected|, each byte widened to an unsigned short.
//
// |length| may be at most strlen(expected) + 1, i.e. it may cover the
// terminating NUL of |expected| but nothing beyond it; larger values yield
// false without reading either array.
bool check_unsigned_shorts(const char* expected,
                           const unsigned short* actual,
                           size_t length) {
  if (length > strlen(expected) + 1)
    return false;

  for (size_t i = 0; i < length; ++i) {
    // Widen through unsigned char: converting a plain char directly would
    // sign-extend bytes >= 0x80 on platforms where char is signed, making the
    // comparison platform-dependent.
    const unsigned short expected_unit = static_cast<unsigned short>(
        static_cast<unsigned char>(expected[i]));
    if (actual[i] != expected_unit)
      return false;
  }
  return true;
}
37 
38 }  // namespace
39 
// Fixture for the TEST_F cases in this file. It adds nothing on top of
// EmbedderTest.
class FPDFTextEmbedderTest : public EmbedderTest {};
41 
TEST_F(FPDFTextEmbedderTest,Text)42 TEST_F(FPDFTextEmbedderTest, Text) {
43   ASSERT_TRUE(OpenDocument("hello_world.pdf"));
44   FPDF_PAGE page = LoadPage(0);
45   ASSERT_TRUE(page);
46 
47   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
48   ASSERT_TRUE(textpage);
49 
50   unsigned short buffer[128];
51   memset(buffer, 0xbd, sizeof(buffer));
52 
53   // Check that edge cases are handled gracefully
54   EXPECT_EQ(0, FPDFText_GetText(textpage, 0, 128, nullptr));
55   EXPECT_EQ(0, FPDFText_GetText(textpage, -1, 128, buffer));
56   EXPECT_EQ(0, FPDFText_GetText(textpage, 0, -1, buffer));
57   EXPECT_EQ(1, FPDFText_GetText(textpage, 0, 0, buffer));
58   EXPECT_EQ(0, buffer[0]);
59 
60   // Keep going and check the next case.
61   memset(buffer, 0xbd, sizeof(buffer));
62   EXPECT_EQ(2, FPDFText_GetText(textpage, 0, 1, buffer));
63   EXPECT_EQ(kHelloGoodbyeText[0], buffer[0]);
64   EXPECT_EQ(0, buffer[1]);
65 
66   // Check includes the terminating NUL that is provided.
67   int num_chars = FPDFText_GetText(textpage, 0, 128, buffer);
68   ASSERT_EQ(kHelloGoodbyeTextSize, num_chars);
69   EXPECT_TRUE(
70       check_unsigned_shorts(kHelloGoodbyeText, buffer, kHelloGoodbyeTextSize));
71 
72   // Count does not include the terminating NUL in the string literal.
73   EXPECT_EQ(kHelloGoodbyeTextSize - 1, FPDFText_CountChars(textpage));
74   for (size_t i = 0; i < kHelloGoodbyeTextSize - 1; ++i) {
75     EXPECT_EQ(static_cast<unsigned int>(kHelloGoodbyeText[i]),
76               FPDFText_GetUnicode(textpage, i))
77         << " at " << i;
78   }
79 
80   // Extracting using a buffer that will be completely filled. Small buffer is
81   // 12 elements long, since it will need 2 locations per displayed character in
82   // the expected string, plus 2 more for the terminating character.
83   static const char kSmallExpected[] = "Hello";
84   unsigned short small_buffer[12];
85   memset(buffer, 0xbd, sizeof(buffer));
86   EXPECT_EQ(6, FPDFText_GetText(textpage, 0, 5, small_buffer));
87   EXPECT_TRUE(check_unsigned_shorts(kSmallExpected, small_buffer,
88                                     sizeof(kSmallExpected)));
89 
90   EXPECT_EQ(12.0, FPDFText_GetFontSize(textpage, 0));
91   EXPECT_EQ(16.0, FPDFText_GetFontSize(textpage, 15));
92 
93   double left = 1.0;
94   double right = 2.0;
95   double bottom = 3.0;
96   double top = 4.0;
97   EXPECT_FALSE(FPDFText_GetCharBox(nullptr, 4, &left, &right, &bottom, &top));
98   EXPECT_DOUBLE_EQ(1.0, left);
99   EXPECT_DOUBLE_EQ(2.0, right);
100   EXPECT_DOUBLE_EQ(3.0, bottom);
101   EXPECT_DOUBLE_EQ(4.0, top);
102   EXPECT_FALSE(FPDFText_GetCharBox(textpage, -1, &left, &right, &bottom, &top));
103   EXPECT_DOUBLE_EQ(1.0, left);
104   EXPECT_DOUBLE_EQ(2.0, right);
105   EXPECT_DOUBLE_EQ(3.0, bottom);
106   EXPECT_DOUBLE_EQ(4.0, top);
107   EXPECT_FALSE(FPDFText_GetCharBox(textpage, 55, &left, &right, &bottom, &top));
108   EXPECT_DOUBLE_EQ(1.0, left);
109   EXPECT_DOUBLE_EQ(2.0, right);
110   EXPECT_DOUBLE_EQ(3.0, bottom);
111   EXPECT_DOUBLE_EQ(4.0, top);
112   EXPECT_FALSE(
113       FPDFText_GetCharBox(textpage, 4, nullptr, &right, &bottom, &top));
114   EXPECT_FALSE(FPDFText_GetCharBox(textpage, 4, &left, nullptr, &bottom, &top));
115   EXPECT_FALSE(FPDFText_GetCharBox(textpage, 4, &left, &right, nullptr, &top));
116   EXPECT_FALSE(
117       FPDFText_GetCharBox(textpage, 4, &left, &right, &bottom, nullptr));
118   EXPECT_FALSE(
119       FPDFText_GetCharBox(textpage, 4, nullptr, nullptr, nullptr, nullptr));
120 
121   EXPECT_TRUE(FPDFText_GetCharBox(textpage, 4, &left, &right, &bottom, &top));
122   EXPECT_NEAR(41.120, left, 0.001);
123   EXPECT_NEAR(46.208, right, 0.001);
124   EXPECT_NEAR(49.892, bottom, 0.001);
125   EXPECT_NEAR(55.652, top, 0.001);
126 
127   FS_RECTF rect = {4.0f, 1.0f, 3.0f, 2.0f};
128   EXPECT_FALSE(FPDFText_GetLooseCharBox(nullptr, 4, &rect));
129   EXPECT_FLOAT_EQ(4.0f, rect.left);
130   EXPECT_FLOAT_EQ(3.0f, rect.right);
131   EXPECT_FLOAT_EQ(2.0f, rect.bottom);
132   EXPECT_FLOAT_EQ(1.0f, rect.top);
133   EXPECT_FALSE(FPDFText_GetLooseCharBox(textpage, -1, &rect));
134   EXPECT_FLOAT_EQ(4.0f, rect.left);
135   EXPECT_FLOAT_EQ(3.0f, rect.right);
136   EXPECT_FLOAT_EQ(2.0f, rect.bottom);
137   EXPECT_FLOAT_EQ(1.0f, rect.top);
138   EXPECT_FALSE(FPDFText_GetLooseCharBox(textpage, 55, &rect));
139   EXPECT_FLOAT_EQ(4.0f, rect.left);
140   EXPECT_FLOAT_EQ(3.0f, rect.right);
141   EXPECT_FLOAT_EQ(2.0f, rect.bottom);
142   EXPECT_FLOAT_EQ(1.0f, rect.top);
143   EXPECT_FALSE(FPDFText_GetLooseCharBox(textpage, 4, nullptr));
144 
145   EXPECT_TRUE(FPDFText_GetLooseCharBox(textpage, 4, &rect));
146   EXPECT_FLOAT_EQ(40.664001f, rect.left);
147   EXPECT_FLOAT_EQ(46.664001f, rect.right);
148   EXPECT_FLOAT_EQ(47.667271f, rect.bottom);
149   EXPECT_FLOAT_EQ(59.667271f, rect.top);
150 
151   double x = 0.0;
152   double y = 0.0;
153   EXPECT_TRUE(FPDFText_GetCharOrigin(textpage, 4, &x, &y));
154   EXPECT_NEAR(40.664, x, 0.001);
155   EXPECT_NEAR(50.000, y, 0.001);
156 
157   EXPECT_EQ(4, FPDFText_GetCharIndexAtPos(textpage, 42.0, 50.0, 1.0, 1.0));
158   EXPECT_EQ(-1, FPDFText_GetCharIndexAtPos(textpage, 0.0, 0.0, 1.0, 1.0));
159   EXPECT_EQ(-1, FPDFText_GetCharIndexAtPos(textpage, 199.0, 199.0, 1.0, 1.0));
160 
161   // Test out of range indicies.
162   EXPECT_EQ(-1,
163             FPDFText_GetCharIndexAtPos(textpage, 42.0, 10000000.0, 1.0, 1.0));
164   EXPECT_EQ(-1, FPDFText_GetCharIndexAtPos(textpage, -1.0, 50.0, 1.0, 1.0));
165 
166   // Count does not include the terminating NUL in the string literal.
167   EXPECT_EQ(2, FPDFText_CountRects(textpage, 0, kHelloGoodbyeTextSize - 1));
168 
169   left = 0.0;
170   right = 0.0;
171   bottom = 0.0;
172   top = 0.0;
173   EXPECT_TRUE(FPDFText_GetRect(textpage, 1, &left, &top, &right, &bottom));
174   EXPECT_NEAR(20.800, left, 0.001);
175   EXPECT_NEAR(135.040, right, 0.001);
176   EXPECT_NEAR(96.688, bottom, 0.001);
177   EXPECT_NEAR(111.600, top, 0.001);
178 
179   // Test out of range indicies set outputs to (0.0, 0.0, 0.0, 0.0).
180   left = -1.0;
181   right = -1.0;
182   bottom = -1.0;
183   top = -1.0;
184   EXPECT_FALSE(FPDFText_GetRect(textpage, -1, &left, &top, &right, &bottom));
185   EXPECT_EQ(0.0, left);
186   EXPECT_EQ(0.0, right);
187   EXPECT_EQ(0.0, bottom);
188   EXPECT_EQ(0.0, top);
189 
190   left = -2.0;
191   right = -2.0;
192   bottom = -2.0;
193   top = -2.0;
194   EXPECT_FALSE(FPDFText_GetRect(textpage, 2, &left, &top, &right, &bottom));
195   EXPECT_EQ(0.0, left);
196   EXPECT_EQ(0.0, right);
197   EXPECT_EQ(0.0, bottom);
198   EXPECT_EQ(0.0, top);
199 
200   EXPECT_EQ(
201       9, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0, nullptr, 0));
202 
203   // Extract starting at character 4 as above.
204   memset(buffer, 0xbd, sizeof(buffer));
205   EXPECT_EQ(
206       1, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0, buffer, 1));
207   EXPECT_TRUE(check_unsigned_shorts(kHelloGoodbyeText + 4, buffer, 1));
208   EXPECT_EQ(0xbdbd, buffer[1]);
209 
210   memset(buffer, 0xbd, sizeof(buffer));
211   EXPECT_EQ(
212       9, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0, buffer, 9));
213   EXPECT_TRUE(check_unsigned_shorts(kHelloGoodbyeText + 4, buffer, 8));
214   EXPECT_EQ(0xbdbd, buffer[9]);
215 
216   memset(buffer, 0xbd, sizeof(buffer));
217   EXPECT_EQ(10, FPDFText_GetBoundedText(textpage, 41.0, 56.0, 82.0, 48.0,
218                                         buffer, 128));
219   EXPECT_TRUE(check_unsigned_shorts(kHelloGoodbyeText + 4, buffer, 9));
220   EXPECT_EQ(0u, buffer[9]);
221   EXPECT_EQ(0xbdbd, buffer[10]);
222 
223   FPDFText_ClosePage(textpage);
224   UnloadPage(page);
225 }
226 
// Checks font size, character origins and loose character boxes reported for
// vertical_text.pdf.
TEST_F(FPDFTextEmbedderTest, TextVertical) {
  ASSERT_TRUE(OpenDocument("vertical_text.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped so the text page is closed before the page is unloaded.
    ScopedFPDFTextPage text_page(FPDFText_LoadPage(page));
    ASSERT_TRUE(text_page);

    EXPECT_EQ(12.0, FPDFText_GetFontSize(text_page.get(), 0));

    // Origins of characters 1 and 2.
    double origin_x = 0.0;
    double origin_y = 0.0;
    EXPECT_TRUE(
        FPDFText_GetCharOrigin(text_page.get(), 1, &origin_x, &origin_y));
    EXPECT_NEAR(6.664, origin_x, 0.001);
    EXPECT_NEAR(171.508, origin_y, 0.001);

    EXPECT_TRUE(
        FPDFText_GetCharOrigin(text_page.get(), 2, &origin_x, &origin_y));
    EXPECT_NEAR(8.668, origin_x, 0.001);
    EXPECT_NEAR(160.492, origin_y, 0.001);

    // Loose boxes of the same two characters.
    FS_RECTF loose_box;
    EXPECT_TRUE(FPDFText_GetLooseCharBox(text_page.get(), 1, &loose_box));
    EXPECT_NEAR(4, loose_box.left, 0.001);
    EXPECT_NEAR(16, loose_box.right, 0.001);
    EXPECT_NEAR(178.984, loose_box.bottom, 0.001);
    EXPECT_NEAR(170.308, loose_box.top, 0.001);

    EXPECT_TRUE(FPDFText_GetLooseCharBox(text_page.get(), 2, &loose_box));
    EXPECT_NEAR(4, loose_box.left, 0.001);
    EXPECT_NEAR(16, loose_box.right, 0.001);
    EXPECT_NEAR(170.308, loose_box.bottom, 0.001);
    EXPECT_NEAR(159.292, loose_box.top, 0.001);
  }

  UnloadPage(page);
}
263 
// Verifies the code units extracted from hebrew_mirrored.pdf, in the order
// FPDFText_GetText() returns them.
TEST_F(FPDFTextEmbedderTest, TextHebrewMirrored) {
  ASSERT_TRUE(OpenDocument("hebrew_mirrored.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    ScopedFPDFTextPage text_page(FPDFText_LoadPage(page));
    ASSERT_TRUE(text_page);

    // Expected code units, excluding the terminating NUL.
    constexpr unsigned short kExpectedUnits[] = {
        0x05d1, 0x05e0, 0x05d9, 0x05de, 0x05d9,
        0x05df, 0x000d, 0x000a, 0x05df, 0x05d1,
    };
    constexpr int kCharCount = 10;
    ASSERT_EQ(kCharCount, FPDFText_CountChars(text_page.get()));

    // One extra element for the terminating NUL.
    unsigned short extracted[kCharCount + 1];
    memset(extracted, 0x42, sizeof(extracted));
    EXPECT_EQ(kCharCount + 1,
              FPDFText_GetText(text_page.get(), 0, kCharCount, extracted));
    for (int i = 0; i < kCharCount; ++i) {
      EXPECT_EQ(kExpectedUnits[i], extracted[i]) << " at " << i;
    }
  }

  UnloadPage(page);
}
294 
TEST_F(FPDFTextEmbedderTest,TextSearch)295 TEST_F(FPDFTextEmbedderTest, TextSearch) {
296   ASSERT_TRUE(OpenDocument("hello_world.pdf"));
297   FPDF_PAGE page = LoadPage(0);
298   ASSERT_TRUE(page);
299 
300   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
301   ASSERT_TRUE(textpage);
302 
303   ScopedFPDFWideString nope = GetFPDFWideString(L"nope");
304   ScopedFPDFWideString world = GetFPDFWideString(L"world");
305   ScopedFPDFWideString world_caps = GetFPDFWideString(L"WORLD");
306   ScopedFPDFWideString world_substr = GetFPDFWideString(L"orld");
307 
308   {
309     // No occurrences of "nope" in test page.
310     ScopedFPDFTextFind search(FPDFText_FindStart(textpage, nope.get(), 0, 0));
311     EXPECT_TRUE(search);
312     EXPECT_EQ(0, FPDFText_GetSchResultIndex(search.get()));
313     EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
314 
315     // Advancing finds nothing.
316     EXPECT_FALSE(FPDFText_FindNext(search.get()));
317     EXPECT_EQ(0, FPDFText_GetSchResultIndex(search.get()));
318     EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
319 
320     // Retreating finds nothing.
321     EXPECT_FALSE(FPDFText_FindPrev(search.get()));
322     EXPECT_EQ(0, FPDFText_GetSchResultIndex(search.get()));
323     EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
324   }
325 
326   {
327     // Two occurrences of "world" in test page.
328     ScopedFPDFTextFind search(FPDFText_FindStart(textpage, world.get(), 0, 2));
329     EXPECT_TRUE(search);
330 
331     // Remains not found until advanced.
332     EXPECT_EQ(0, FPDFText_GetSchResultIndex(search.get()));
333     EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
334 
335     // First occurrence of "world" in this test page.
336     EXPECT_TRUE(FPDFText_FindNext(search.get()));
337     EXPECT_EQ(7, FPDFText_GetSchResultIndex(search.get()));
338     EXPECT_EQ(5, FPDFText_GetSchCount(search.get()));
339 
340     // Last occurrence of "world" in this test page.
341     EXPECT_TRUE(FPDFText_FindNext(search.get()));
342     EXPECT_EQ(24, FPDFText_GetSchResultIndex(search.get()));
343     EXPECT_EQ(5, FPDFText_GetSchCount(search.get()));
344 
345     // Found position unchanged when fails to advance.
346     EXPECT_FALSE(FPDFText_FindNext(search.get()));
347     EXPECT_EQ(24, FPDFText_GetSchResultIndex(search.get()));
348     EXPECT_EQ(5, FPDFText_GetSchCount(search.get()));
349 
350     // Back to first occurrence.
351     EXPECT_TRUE(FPDFText_FindPrev(search.get()));
352     EXPECT_EQ(7, FPDFText_GetSchResultIndex(search.get()));
353     EXPECT_EQ(5, FPDFText_GetSchCount(search.get()));
354 
355     // Found position unchanged when fails to retreat.
356     EXPECT_FALSE(FPDFText_FindPrev(search.get()));
357     EXPECT_EQ(7, FPDFText_GetSchResultIndex(search.get()));
358     EXPECT_EQ(5, FPDFText_GetSchCount(search.get()));
359   }
360 
361   {
362     // Exact search unaffected by case sensitiity and whole word flags.
363     ScopedFPDFTextFind search(FPDFText_FindStart(
364         textpage, world.get(), FPDF_MATCHCASE | FPDF_MATCHWHOLEWORD, 0));
365     EXPECT_TRUE(search);
366     EXPECT_TRUE(FPDFText_FindNext(search.get()));
367     EXPECT_EQ(7, FPDFText_GetSchResultIndex(search.get()));
368     EXPECT_EQ(5, FPDFText_GetSchCount(search.get()));
369   }
370 
371   {
372     // Default is case-insensitive, so matching agaist caps works.
373     ScopedFPDFTextFind search(
374         FPDFText_FindStart(textpage, world_caps.get(), 0, 0));
375     EXPECT_TRUE(search);
376     EXPECT_TRUE(FPDFText_FindNext(search.get()));
377     EXPECT_EQ(7, FPDFText_GetSchResultIndex(search.get()));
378     EXPECT_EQ(5, FPDFText_GetSchCount(search.get()));
379   }
380 
381   {
382     // But can be made case sensitive, in which case this fails.
383     ScopedFPDFTextFind search(
384         FPDFText_FindStart(textpage, world_caps.get(), FPDF_MATCHCASE, 0));
385     EXPECT_FALSE(FPDFText_FindNext(search.get()));
386     EXPECT_EQ(0, FPDFText_GetSchResultIndex(search.get()));
387     EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
388   }
389 
390   {
391     // Default is match anywhere within word, so matching substring works.
392     ScopedFPDFTextFind search(
393         FPDFText_FindStart(textpage, world_substr.get(), 0, 0));
394     EXPECT_TRUE(FPDFText_FindNext(search.get()));
395     EXPECT_EQ(8, FPDFText_GetSchResultIndex(search.get()));
396     EXPECT_EQ(4, FPDFText_GetSchCount(search.get()));
397   }
398 
399   {
400     // But can be made to mach word boundaries, in which case this fails.
401     ScopedFPDFTextFind search(FPDFText_FindStart(textpage, world_substr.get(),
402                                                  FPDF_MATCHWHOLEWORD, 0));
403     EXPECT_FALSE(FPDFText_FindNext(search.get()));
404     // TODO(tsepez): investigate strange index/count values in this state.
405   }
406 
407   FPDFText_ClosePage(textpage);
408   UnloadPage(page);
409 }
410 
TEST_F(FPDFTextEmbedderTest,TextSearchConsecutive)411 TEST_F(FPDFTextEmbedderTest, TextSearchConsecutive) {
412   ASSERT_TRUE(OpenDocument("find_text_consecutive.pdf"));
413   FPDF_PAGE page = LoadPage(0);
414   ASSERT_TRUE(page);
415 
416   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
417   ASSERT_TRUE(textpage);
418 
419   ScopedFPDFWideString aaaa = GetFPDFWideString(L"aaaa");
420 
421   {
422     // Search for "aaaa" yields 2 results in "aaaaaaaaaa".
423     ScopedFPDFTextFind search(FPDFText_FindStart(textpage, aaaa.get(), 0, 0));
424     EXPECT_TRUE(search);
425 
426     // Remains not found until advanced.
427     EXPECT_EQ(0, FPDFText_GetSchResultIndex(search.get()));
428     EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
429 
430     // First occurrence of "aaaa" in this test page.
431     EXPECT_TRUE(FPDFText_FindNext(search.get()));
432     EXPECT_EQ(0, FPDFText_GetSchResultIndex(search.get()));
433     EXPECT_EQ(4, FPDFText_GetSchCount(search.get()));
434 
435     // Last occurrence of "aaaa" in this test page.
436     EXPECT_TRUE(FPDFText_FindNext(search.get()));
437     EXPECT_EQ(4, FPDFText_GetSchResultIndex(search.get()));
438     EXPECT_EQ(4, FPDFText_GetSchCount(search.get()));
439 
440     // Found position unchanged when fails to advance.
441     EXPECT_FALSE(FPDFText_FindNext(search.get()));
442     EXPECT_EQ(4, FPDFText_GetSchResultIndex(search.get()));
443     EXPECT_EQ(4, FPDFText_GetSchCount(search.get()));
444 
445     // Back to first occurrence.
446     EXPECT_TRUE(FPDFText_FindPrev(search.get()));
447     EXPECT_EQ(0, FPDFText_GetSchResultIndex(search.get()));
448     EXPECT_EQ(4, FPDFText_GetSchCount(search.get()));
449 
450     // Found position unchanged when fails to retreat.
451     EXPECT_FALSE(FPDFText_FindPrev(search.get()));
452     EXPECT_EQ(0, FPDFText_GetSchResultIndex(search.get()));
453     EXPECT_EQ(4, FPDFText_GetSchCount(search.get()));
454   }
455 
456   {
457     // Search for "aaaa" yields 7 results in "aaaaaaaaaa", when searching with
458     // FPDF_CONSECUTIVE.
459     ScopedFPDFTextFind search(
460         FPDFText_FindStart(textpage, aaaa.get(), FPDF_CONSECUTIVE, 0));
461     EXPECT_TRUE(search);
462 
463     // Remains not found until advanced.
464     EXPECT_EQ(0, FPDFText_GetSchResultIndex(search.get()));
465     EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
466 
467     // Find consecutive occurrences of "aaaa" in this test page:
468     for (int i = 0; i < 7; ++i) {
469       EXPECT_TRUE(FPDFText_FindNext(search.get()));
470       EXPECT_EQ(i, FPDFText_GetSchResultIndex(search.get()));
471       EXPECT_EQ(4, FPDFText_GetSchCount(search.get()));
472     }
473 
474     // Found position unchanged when fails to advance.
475     EXPECT_FALSE(FPDFText_FindNext(search.get()));
476     EXPECT_EQ(6, FPDFText_GetSchResultIndex(search.get()));
477     EXPECT_EQ(4, FPDFText_GetSchCount(search.get()));
478 
479     for (int i = 5; i >= 0; --i) {
480       EXPECT_TRUE(FPDFText_FindPrev(search.get()));
481       EXPECT_EQ(i, FPDFText_GetSchResultIndex(search.get()));
482       EXPECT_EQ(4, FPDFText_GetSchCount(search.get()));
483     }
484 
485     // Found position unchanged when fails to retreat.
486     EXPECT_FALSE(FPDFText_FindPrev(search.get()));
487     EXPECT_EQ(0, FPDFText_GetSchResultIndex(search.get()));
488     EXPECT_EQ(4, FPDFText_GetSchCount(search.get()));
489   }
490 
491   FPDFText_ClosePage(textpage);
492   UnloadPage(page);
493 }
494 
495 // Fails on Windows. https://crbug.com/pdfium/1370
496 #if BUILDFLAG(IS_WIN)
497 #define MAYBE_TextSearchLatinExtended DISABLED_TextSearchLatinExtended
498 #else
499 #define MAYBE_TextSearchLatinExtended TextSearchLatinExtended
500 #endif
TEST_F(FPDFTextEmbedderTest,MAYBE_TextSearchLatinExtended)501 TEST_F(FPDFTextEmbedderTest, MAYBE_TextSearchLatinExtended) {
502   ASSERT_TRUE(OpenDocument("latin_extended.pdf"));
503   FPDF_PAGE page = LoadPage(0);
504   ASSERT_TRUE(page);
505 
506   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
507   ASSERT_TRUE(textpage);
508 
509   // Upper/lowercase 'a' with breve.
510   constexpr FPDF_WCHAR kNeedleUpper[] = {0x0102, 0x0000};
511   constexpr FPDF_WCHAR kNeedleLower[] = {0x0103, 0x0000};
512 
513   for (const auto* needle : {kNeedleUpper, kNeedleLower}) {
514     ScopedFPDFTextFind search(FPDFText_FindStart(textpage, needle, 0, 0));
515     EXPECT_TRUE(search);
516     EXPECT_EQ(0, FPDFText_GetSchResultIndex(search.get()));
517     EXPECT_EQ(0, FPDFText_GetSchCount(search.get()));
518 
519     // Should find 2 results at position 21/22, both with length 1.
520     EXPECT_TRUE(FPDFText_FindNext(search.get()));
521     EXPECT_EQ(2, FPDFText_GetSchResultIndex(search.get()));
522     EXPECT_EQ(1, FPDFText_GetSchCount(search.get()));
523     EXPECT_TRUE(FPDFText_FindNext(search.get()));
524     EXPECT_EQ(3, FPDFText_GetSchResultIndex(search.get()));
525     EXPECT_EQ(1, FPDFText_GetSchCount(search.get()));
526     // And no more than 2 results.
527     EXPECT_FALSE(FPDFText_FindNext(search.get()));
528   }
529 
530   FPDFText_ClosePage(textpage);
531   UnloadPage(page);
532 }
533 
534 // Test that the page has characters despite a bad stream length.
TEST_F(FPDFTextEmbedderTest,StreamLengthPastEndOfFile)535 TEST_F(FPDFTextEmbedderTest, StreamLengthPastEndOfFile) {
536   ASSERT_TRUE(OpenDocument("bug_57.pdf"));
537   FPDF_PAGE page = LoadPage(0);
538   ASSERT_TRUE(page);
539 
540   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
541   ASSERT_TRUE(textpage);
542   EXPECT_EQ(13, FPDFText_CountChars(textpage));
543 
544   FPDFText_ClosePage(textpage);
545   UnloadPage(page);
546 }
547 
TEST_F(FPDFTextEmbedderTest,WebLinks)548 TEST_F(FPDFTextEmbedderTest, WebLinks) {
549   ASSERT_TRUE(OpenDocument("weblinks.pdf"));
550   FPDF_PAGE page = LoadPage(0);
551   ASSERT_TRUE(page);
552 
553   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
554   ASSERT_TRUE(textpage);
555 
556   {
557     ScopedFPDFPageLink pagelink(FPDFLink_LoadWebLinks(textpage));
558     EXPECT_TRUE(pagelink);
559 
560     // Page contains two HTTP-style URLs.
561     EXPECT_EQ(2, FPDFLink_CountWebLinks(pagelink.get()));
562 
563     // Only a terminating NUL required for bogus links.
564     EXPECT_EQ(1, FPDFLink_GetURL(pagelink.get(), 2, nullptr, 0));
565     EXPECT_EQ(1, FPDFLink_GetURL(pagelink.get(), 1400, nullptr, 0));
566     EXPECT_EQ(1, FPDFLink_GetURL(pagelink.get(), -1, nullptr, 0));
567   }
568 
569   FPDF_PAGELINK pagelink = FPDFLink_LoadWebLinks(textpage);
570   EXPECT_TRUE(pagelink);
571 
572   // Query the number of characters required for each link (incl NUL).
573   EXPECT_EQ(25, FPDFLink_GetURL(pagelink, 0, nullptr, 0));
574   EXPECT_EQ(26, FPDFLink_GetURL(pagelink, 1, nullptr, 0));
575 
576   static const char expected_url[] = "http://example.com?q=foo";
577   static const size_t expected_len = sizeof(expected_url);
578   unsigned short buffer[128];
579 
580   // Retrieve a link with too small a buffer.  Buffer will not be
581   // NUL-terminated, but must not be modified past indicated length,
582   // so pre-fill with a pattern to check write bounds.
583   memset(buffer, 0xbd, sizeof(buffer));
584   EXPECT_EQ(1, FPDFLink_GetURL(pagelink, 0, buffer, 1));
585   EXPECT_TRUE(check_unsigned_shorts(expected_url, buffer, 1));
586   EXPECT_EQ(0xbdbd, buffer[1]);
587 
588   // Check buffer that doesn't have space for a terminating NUL.
589   memset(buffer, 0xbd, sizeof(buffer));
590   EXPECT_EQ(static_cast<int>(expected_len - 1),
591             FPDFLink_GetURL(pagelink, 0, buffer, expected_len - 1));
592   EXPECT_TRUE(check_unsigned_shorts(expected_url, buffer, expected_len - 1));
593   EXPECT_EQ(0xbdbd, buffer[expected_len - 1]);
594 
595   // Retreive link with exactly-sized buffer.
596   memset(buffer, 0xbd, sizeof(buffer));
597   EXPECT_EQ(static_cast<int>(expected_len),
598             FPDFLink_GetURL(pagelink, 0, buffer, expected_len));
599   EXPECT_TRUE(check_unsigned_shorts(expected_url, buffer, expected_len));
600   EXPECT_EQ(0u, buffer[expected_len - 1]);
601   EXPECT_EQ(0xbdbd, buffer[expected_len]);
602 
603   // Retreive link with ample-sized-buffer.
604   memset(buffer, 0xbd, sizeof(buffer));
605   EXPECT_EQ(static_cast<int>(expected_len),
606             FPDFLink_GetURL(pagelink, 0, buffer, 128));
607   EXPECT_TRUE(check_unsigned_shorts(expected_url, buffer, expected_len));
608   EXPECT_EQ(0u, buffer[expected_len - 1]);
609   EXPECT_EQ(0xbdbd, buffer[expected_len]);
610 
611   // Each link rendered in a single rect in this test page.
612   EXPECT_EQ(1, FPDFLink_CountRects(pagelink, 0));
613   EXPECT_EQ(1, FPDFLink_CountRects(pagelink, 1));
614 
615   // Each link rendered in a single rect in this test page.
616   EXPECT_EQ(0, FPDFLink_CountRects(pagelink, -1));
617   EXPECT_EQ(0, FPDFLink_CountRects(pagelink, 2));
618   EXPECT_EQ(0, FPDFLink_CountRects(pagelink, 10000));
619 
620   // Check boundary of valid link index with valid rect index.
621   double left = 0.0;
622   double right = 0.0;
623   double top = 0.0;
624   double bottom = 0.0;
625   EXPECT_TRUE(FPDFLink_GetRect(pagelink, 0, 0, &left, &top, &right, &bottom));
626   EXPECT_NEAR(50.828, left, 0.001);
627   EXPECT_NEAR(187.904, right, 0.001);
628   EXPECT_NEAR(97.516, bottom, 0.001);
629   EXPECT_NEAR(108.700, top, 0.001);
630 
631   // Check that valid link with invalid rect index leaves parameters unchanged.
632   left = -1.0;
633   right = -1.0;
634   top = -1.0;
635   bottom = -1.0;
636   EXPECT_FALSE(FPDFLink_GetRect(pagelink, 0, 1, &left, &top, &right, &bottom));
637   EXPECT_EQ(-1.0, left);
638   EXPECT_EQ(-1.0, right);
639   EXPECT_EQ(-1.0, bottom);
640   EXPECT_EQ(-1.0, top);
641 
642   // Check that invalid link index leaves parameters unchanged.
643   left = -2.0;
644   right = -2.0;
645   top = -2.0;
646   bottom = -2.0;
647   EXPECT_FALSE(FPDFLink_GetRect(pagelink, -1, 0, &left, &top, &right, &bottom));
648   EXPECT_EQ(-2.0, left);
649   EXPECT_EQ(-2.0, right);
650   EXPECT_EQ(-2.0, bottom);
651   EXPECT_EQ(-2.0, top);
652 
653   FPDFLink_CloseWebLinks(pagelink);
654   FPDFText_ClosePage(textpage);
655   UnloadPage(page);
656 }
657 
// Checks the URLs detected on weblinks_across_lines.pdf, where the link text
// is split by line breaks.
TEST_F(FPDFTextEmbedderTest, WebLinksAcrossLines) {
  ASSERT_TRUE(OpenDocument("weblinks_across_lines.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped so the handles are closed (links first, then the text page)
    // before the page is unloaded.
    ScopedFPDFTextPage text_page(FPDFText_LoadPage(page));
    ASSERT_TRUE(text_page);

    ScopedFPDFPageLink links(FPDFLink_LoadWebLinks(text_page.get()));
    EXPECT_TRUE(links);

    static const char* const kExpectedUrls[] = {
        "http://example.com",           // from "http://www.example.com?\r\nfoo"
        "http://example.com/",          // from "http://www.example.com/\r\nfoo"
        "http://example.com/test-foo",  // from "http://example.com/test-\r\nfoo"
        "http://abc.com/test-foo",      // from "http://abc.com/test-\r\n\r\nfoo"
        // Next two links from "http://www.example.com/\r\nhttp://www.abc.com/"
        "http://example.com/",
        "http://www.abc.com",
    };
    static const int kNumLinks = static_cast<int>(std::size(kExpectedUrls));

    EXPECT_EQ(kNumLinks, FPDFLink_CountWebLinks(links.get()));

    unsigned short url_buffer[128];
    int link_index = 0;
    for (const char* expected_url : kExpectedUrls) {
      // Required length includes the terminating NUL.
      const size_t expected_len = strlen(expected_url) + 1;
      memset(url_buffer, 0, sizeof(url_buffer));
      EXPECT_EQ(static_cast<int>(expected_len),
                FPDFLink_GetURL(links.get(), link_index, nullptr, 0));
      EXPECT_EQ(static_cast<int>(expected_len),
                FPDFLink_GetURL(links.get(), link_index, url_buffer,
                                std::size(url_buffer)));
      EXPECT_TRUE(
          check_unsigned_shorts(expected_url, url_buffer, expected_len));
      ++link_index;
    }
  }

  UnloadPage(page);
}
697 
// Checks the second web link detected on bug_650.pdf.
TEST_F(FPDFTextEmbedderTest, WebLinksAcrossLinesBug) {
  ASSERT_TRUE(OpenDocument("bug_650.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped so the handles are closed (links first, then the text page)
    // before the page is unloaded.
    ScopedFPDFTextPage text_page(FPDFText_LoadPage(page));
    ASSERT_TRUE(text_page);

    ScopedFPDFPageLink links(FPDFLink_LoadWebLinks(text_page.get()));
    EXPECT_TRUE(links);

    EXPECT_EQ(2, FPDFLink_CountWebLinks(links.get()));

    // |kUrlSize| counts the terminating NUL.
    static const char kExpectedUrl[] =
        "http://tutorial45.com/learn-autocad-basics-day-166/";
    static const int kUrlSize = static_cast<int>(sizeof(kExpectedUrl));
    unsigned short url_buffer[128] = {0};

    EXPECT_EQ(kUrlSize, FPDFLink_GetURL(links.get(), 1, nullptr, 0));
    EXPECT_EQ(kUrlSize, FPDFLink_GetURL(links.get(), 1, url_buffer,
                                        std::size(url_buffer)));
    EXPECT_TRUE(check_unsigned_shorts(kExpectedUrl, url_buffer, kUrlSize));
  }

  UnloadPage(page);
}
723 
// Checks FPDFLink_GetTextRange() on weblinks.pdf for a valid link and for
// several invalid arguments, which must leave the out-parameters untouched.
TEST_F(FPDFTextEmbedderTest, WebLinksCharRanges) {
  ASSERT_TRUE(OpenDocument("weblinks.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handles: the ASSERT_* checks below return from the test on
    // failure, and raw handles would then never be closed. Declaration order
    // makes the page link close before the text page.
    ScopedFPDFTextPage text_page(FPDFText_LoadPage(page));
    ASSERT_TRUE(text_page);

    ScopedFPDFPageLink page_link(FPDFLink_LoadWebLinks(text_page.get()));
    EXPECT_TRUE(page_link);

    // Test for char indices of a valid link
    int start_char_index = 0;
    int char_count = 0;
    ASSERT_TRUE(FPDFLink_GetTextRange(page_link.get(), 0, &start_char_index,
                                      &char_count));
    EXPECT_EQ(35, start_char_index);
    EXPECT_EQ(24, char_count);

    // Test for char indices of an invalid link
    start_char_index = -10;
    char_count = -8;
    ASSERT_FALSE(FPDFLink_GetTextRange(page_link.get(), 6, &start_char_index,
                                       &char_count));
    EXPECT_EQ(-10, start_char_index);
    EXPECT_EQ(-8, char_count);

    // Test for pagelink = nullptr
    start_char_index = -10;
    char_count = -8;
    ASSERT_FALSE(
        FPDFLink_GetTextRange(nullptr, 0, &start_char_index, &char_count));
    EXPECT_EQ(-10, start_char_index);
    EXPECT_EQ(-8, char_count);

    // Test for link_index < 0
    start_char_index = -10;
    char_count = -8;
    ASSERT_FALSE(FPDFLink_GetTextRange(page_link.get(), -4, &start_char_index,
                                       &char_count));
    EXPECT_EQ(-10, start_char_index);
    EXPECT_EQ(-8, char_count);
  }

  UnloadPage(page);
}
771 
TEST_F(FPDFTextEmbedderTest,AnnotLinks)772 TEST_F(FPDFTextEmbedderTest, AnnotLinks) {
773   ASSERT_TRUE(OpenDocument("annots.pdf"));
774   FPDF_PAGE page = LoadPage(0);
775   ASSERT_TRUE(page);
776 
777   // Get link count via checking annotation subtype
778   int annot_count = FPDFPage_GetAnnotCount(page);
779   ASSERT_EQ(9, annot_count);
780   int annot_subtype_link_count = 0;
781   for (int i = 0; i < annot_count; ++i) {
782     ScopedFPDFAnnotation annot(FPDFPage_GetAnnot(page, i));
783     if (FPDFAnnot_GetSubtype(annot.get()) == FPDF_ANNOT_LINK) {
784       ++annot_subtype_link_count;
785     }
786   }
787   EXPECT_EQ(4, annot_subtype_link_count);
788 
789   // Validate that FPDFLink_Enumerate() returns same number of links
790   int start_pos = 0;
791   FPDF_LINK link_annot;
792   int link_count = 0;
793   while (FPDFLink_Enumerate(page, &start_pos, &link_annot)) {
794     ASSERT_TRUE(link_annot);
795     if (start_pos == 1 || start_pos == 2) {
796       // First two links point to first and second page within the document
797       // respectively
798       FPDF_DEST link_dest = FPDFLink_GetDest(document(), link_annot);
799       EXPECT_TRUE(link_dest);
800       EXPECT_EQ(start_pos - 1,
801                 FPDFDest_GetDestPageIndex(document(), link_dest));
802     } else if (start_pos == 3) {  // points to PDF Spec URL
803       FS_RECTF link_rect;
804       EXPECT_TRUE(FPDFLink_GetAnnotRect(link_annot, &link_rect));
805       EXPECT_NEAR(66.0, link_rect.left, 0.001);
806       EXPECT_NEAR(544.0, link_rect.top, 0.001);
807       EXPECT_NEAR(196.0, link_rect.right, 0.001);
808       EXPECT_NEAR(529.0, link_rect.bottom, 0.001);
809     } else if (start_pos == 4) {  // this link has quad points
810       int quad_point_count = FPDFLink_CountQuadPoints(link_annot);
811       EXPECT_EQ(1, quad_point_count);
812       FS_QUADPOINTSF quad_points;
813       EXPECT_TRUE(FPDFLink_GetQuadPoints(link_annot, 0, &quad_points));
814       EXPECT_NEAR(83.0, quad_points.x1, 0.001);
815       EXPECT_NEAR(453.0, quad_points.y1, 0.001);
816       EXPECT_NEAR(178.0, quad_points.x2, 0.001);
817       EXPECT_NEAR(453.0, quad_points.y2, 0.001);
818       EXPECT_NEAR(83.0, quad_points.x3, 0.001);
819       EXPECT_NEAR(440.0, quad_points.y3, 0.001);
820       EXPECT_NEAR(178.0, quad_points.x4, 0.001);
821       EXPECT_NEAR(440.0, quad_points.y4, 0.001);
822       // AnnotRect is same as quad points for this link
823       FS_RECTF link_rect;
824       EXPECT_TRUE(FPDFLink_GetAnnotRect(link_annot, &link_rect));
825       EXPECT_NEAR(link_rect.left, quad_points.x1, 0.001);
826       EXPECT_NEAR(link_rect.top, quad_points.y1, 0.001);
827       EXPECT_NEAR(link_rect.right, quad_points.x4, 0.001);
828       EXPECT_NEAR(link_rect.bottom, quad_points.y4, 0.001);
829     }
830     ++link_count;
831   }
832   EXPECT_EQ(annot_subtype_link_count, link_count);
833 
834   UnloadPage(page);
835 }
836 
// Verifies FPDFText_GetFontSize() for every character of hello_world.pdf.
TEST_F(FPDFTextEmbedderTest, GetFontSize) {
  ASSERT_TRUE(OpenDocument("hello_world.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handle: the text page is closed even if an ASSERT_* below returns
    // early, and always before the page itself is unloaded.
    ScopedFPDFTextPage textpage(FPDFText_LoadPage(page));
    ASSERT_TRUE(textpage);

    const double kExpectedFontsSizes[] = {
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 1,  1,
        16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};

    const int count = FPDFText_CountChars(textpage.get());
    ASSERT_EQ(std::size(kExpectedFontsSizes), static_cast<size_t>(count));
    for (int i = 0; i < count; ++i) {
      EXPECT_EQ(kExpectedFontsSizes[i],
                FPDFText_GetFontSize(textpage.get(), i))
          << i;
    }
  }

  UnloadPage(page);
}
857 
TEST_F(FPDFTextEmbedderTest,GetFontInfo)858 TEST_F(FPDFTextEmbedderTest, GetFontInfo) {
859   ASSERT_TRUE(OpenDocument("hello_world.pdf"));
860   FPDF_PAGE page = LoadPage(0);
861   ASSERT_TRUE(page);
862 
863   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
864   ASSERT_TRUE(textpage);
865   std::vector<char> font_name;
866   size_t num_chars1 = strlen("Hello, world!");
867   const char kExpectedFontName1[] = "Times-Roman";
868 
869   for (size_t i = 0; i < num_chars1; i++) {
870     int flags = -1;
871     unsigned long length =
872         FPDFText_GetFontInfo(textpage, i, nullptr, 0, &flags);
873     static constexpr unsigned long expected_length = sizeof(kExpectedFontName1);
874     ASSERT_EQ(expected_length, length);
875     EXPECT_EQ(FXFONT_NONSYMBOLIC, flags);
876     font_name.resize(length);
877     std::fill(font_name.begin(), font_name.end(), 'a');
878     flags = -1;
879     EXPECT_EQ(expected_length,
880               FPDFText_GetFontInfo(textpage, i, font_name.data(),
881                                    font_name.size(), &flags));
882     EXPECT_STREQ(kExpectedFontName1, font_name.data());
883     EXPECT_EQ(FXFONT_NONSYMBOLIC, flags);
884   }
885   // If the size of the buffer is not large enough, the buffer should remain
886   // unchanged.
887   font_name.pop_back();
888   std::fill(font_name.begin(), font_name.end(), 'a');
889   EXPECT_EQ(sizeof(kExpectedFontName1),
890             FPDFText_GetFontInfo(textpage, 0, font_name.data(),
891                                  font_name.size(), nullptr));
892   for (char a : font_name)
893     EXPECT_EQ('a', a);
894 
895   // The text is "Hello, world!\r\nGoodbye, world!", so the next two characters
896   // do not have any font information.
897   EXPECT_EQ(0u, FPDFText_GetFontInfo(textpage, num_chars1, font_name.data(),
898                                      font_name.size(), nullptr));
899   EXPECT_EQ(0u, FPDFText_GetFontInfo(textpage, num_chars1 + 1, font_name.data(),
900                                      font_name.size(), nullptr));
901 
902   size_t num_chars2 = strlen("Goodbye, world!");
903   const char kExpectedFontName2[] = "Helvetica";
904   for (size_t i = num_chars1 + 2; i < num_chars1 + num_chars2 + 2; i++) {
905     int flags = -1;
906     unsigned long length =
907         FPDFText_GetFontInfo(textpage, i, nullptr, 0, &flags);
908     static constexpr unsigned long expected_length = sizeof(kExpectedFontName2);
909     ASSERT_EQ(expected_length, length);
910     EXPECT_EQ(FXFONT_NONSYMBOLIC, flags);
911     font_name.resize(length);
912     std::fill(font_name.begin(), font_name.end(), 'a');
913     flags = -1;
914     EXPECT_EQ(expected_length,
915               FPDFText_GetFontInfo(textpage, i, font_name.data(),
916                                    font_name.size(), &flags));
917     EXPECT_STREQ(kExpectedFontName2, font_name.data());
918     EXPECT_EQ(FXFONT_NONSYMBOLIC, flags);
919   }
920 
921   // Now try some out of bounds indices and null pointers to make sure we do not
922   // crash.
923   // No textpage.
924   EXPECT_EQ(0u, FPDFText_GetFontInfo(nullptr, 0, font_name.data(),
925                                      font_name.size(), nullptr));
926   // No buffer.
927   EXPECT_EQ(sizeof(kExpectedFontName1),
928             FPDFText_GetFontInfo(textpage, 0, nullptr, 0, nullptr));
929   // Negative index.
930   EXPECT_EQ(0u, FPDFText_GetFontInfo(textpage, -1, font_name.data(),
931                                      font_name.size(), nullptr));
932   // Out of bounds index.
933   EXPECT_EQ(0u, FPDFText_GetFontInfo(textpage, 1000, font_name.data(),
934                                      font_name.size(), nullptr));
935 
936   FPDFText_ClosePage(textpage);
937   UnloadPage(page);
938 }
939 
// bug_583.pdf is expected to yield exactly one character whose Unicode value
// is reported as 0.
TEST_F(FPDFTextEmbedderTest, ToUnicode) {
  ASSERT_TRUE(OpenDocument("bug_583.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handle: the text page is closed even if the ASSERT_EQ below
    // returns early, and always before the page is unloaded.
    ScopedFPDFTextPage textpage(FPDFText_LoadPage(page));
    ASSERT_TRUE(textpage);

    ASSERT_EQ(1, FPDFText_CountChars(textpage.get()));
    EXPECT_EQ(0U, FPDFText_GetUnicode(textpage.get(), 0));
  }

  UnloadPage(page);
}
954 
// Checks FPDFText_IsGenerated() for real characters, for the generated "\r\n"
// line break, and for invalid arguments.
TEST_F(FPDFTextEmbedderTest, IsGenerated) {
  ASSERT_TRUE(OpenDocument("hello_world.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    ScopedFPDFTextPage scoped_text(FPDFText_LoadPage(page));
    ASSERT_TRUE(scoped_text);
    FPDF_TEXTPAGE text = scoped_text.get();

    // Each entry: character index, expected code point, expected
    // FPDFText_IsGenerated() result.
    struct GeneratedCase {
      int index;
      unsigned int unicode;
      int generated;
    };
    const GeneratedCase kCases[] = {
        {0, static_cast<unsigned int>('H'), 0},
        {6, static_cast<unsigned int>(' '), 0},
        {13, static_cast<unsigned int>('\r'), 1},
        {14, static_cast<unsigned int>('\n'), 1},
    };
    for (const GeneratedCase& entry : kCases) {
      EXPECT_EQ(entry.unicode, FPDFText_GetUnicode(text, entry.index))
          << entry.index;
      EXPECT_EQ(entry.generated, FPDFText_IsGenerated(text, entry.index))
          << entry.index;
    }

    // Invalid index or text page reports -1.
    EXPECT_EQ(-1, FPDFText_IsGenerated(text, -1));
    EXPECT_EQ(-1, FPDFText_IsGenerated(text, kHelloGoodbyeTextSize));
    EXPECT_EQ(-1, FPDFText_IsGenerated(nullptr, 6));
  }

  UnloadPage(page);
}
985 
// Checks FPDFText_HasUnicodeMapError() for mapped characters, an unmapped
// character, and invalid arguments.
TEST_F(FPDFTextEmbedderTest, IsInvalidUnicode) {
  ASSERT_TRUE(OpenDocument("bug_1388_2.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    constexpr int kExpectedCharCount = 5;
    ScopedFPDFTextPage scoped_text(FPDFText_LoadPage(page));
    ASSERT_TRUE(scoped_text);
    FPDF_TEXTPAGE text = scoped_text.get();
    EXPECT_EQ(kExpectedCharCount, FPDFText_CountChars(text));

    // Each entry: character index, expected code point, expected
    // FPDFText_HasUnicodeMapError() result.
    struct MapErrorCase {
      int index;
      unsigned int unicode;
      int has_error;
    };
    const MapErrorCase kCases[] = {
        {0, static_cast<unsigned int>('X'), 0},
        {1, static_cast<unsigned int>(' '), 0},
        {2, 31u, 1},
    };
    for (const MapErrorCase& entry : kCases) {
      EXPECT_EQ(entry.unicode, FPDFText_GetUnicode(text, entry.index))
          << entry.index;
      EXPECT_EQ(entry.has_error,
                FPDFText_HasUnicodeMapError(text, entry.index))
          << entry.index;
    }

    // Invalid index or text page reports -1.
    EXPECT_EQ(-1, FPDFText_HasUnicodeMapError(text, -1));
    EXPECT_EQ(-1, FPDFText_HasUnicodeMapError(text, kExpectedCharCount));
    EXPECT_EQ(-1, FPDFText_HasUnicodeMapError(nullptr, 0));
  }

  UnloadPage(page);
}
1015 
// Checks a run of non-ASCII characters in bug_921.pdf both per character
// (FPDFText_GetUnicode) and in bulk (FPDFText_GetText).
TEST_F(FPDFTextEmbedderTest, Bug_921) {
  ASSERT_TRUE(OpenDocument("bug_921.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handle: the text page is closed even if an ASSERT_* below returns
    // early, and always before the page is unloaded.
    ScopedFPDFTextPage textpage(FPDFText_LoadPage(page));
    ASSERT_TRUE(textpage);

    static constexpr unsigned int kData[] = {
        1095, 1077, 1083, 1086, 1074, 1077, 1095, 1077, 1089, 1082, 1086, 1077,
        32,   1089, 1090, 1088, 1072, 1076, 1072, 1085, 1080, 1077, 46,   32};
    static constexpr int kStartIndex = 238;

    ASSERT_EQ(268, FPDFText_CountChars(textpage.get()));
    for (size_t i = 0; i < std::size(kData); ++i) {
      EXPECT_EQ(kData[i],
                FPDFText_GetUnicode(textpage.get(),
                                    kStartIndex + static_cast<int>(i)))
          << i;
    }

    // One extra slot for the terminating NUL that FPDFText_GetText() writes.
    unsigned short buffer[std::size(kData) + 1];
    memset(buffer, 0xbd, sizeof(buffer));
    int count = FPDFText_GetText(textpage.get(), kStartIndex,
                                 std::size(kData), buffer);
    ASSERT_GT(count, 0);
    ASSERT_EQ(std::size(kData) + 1, static_cast<size_t>(count));
    for (size_t i = 0; i < std::size(kData); ++i)
      EXPECT_EQ(kData[i], buffer[i]) << i;
    EXPECT_EQ(0, buffer[std::size(kData)]);
  }

  UnloadPage(page);
}
1045 
// Checks how FPDFText_GetText() reports soft and hard hyphens in
// bug_781804.pdf.
TEST_F(FPDFTextEmbedderTest, GetTextWithHyphen) {
  ASSERT_TRUE(OpenDocument("bug_781804.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handle: the text page is always closed before the page is
    // unloaded.
    ScopedFPDFTextPage textpage(FPDFText_LoadPage(page));
    ASSERT_TRUE(textpage);

    // Check that soft hyphens are not included
    // Expecting 'Veritaserum', except there is a \uFFFE where the hyphen was
    // in the original text. This is a weird thing that Adobe does, which we
    // replicate.
    constexpr unsigned short soft_expected[] = {
        0x0056, 0x0065, 0x0072, 0x0069, 0x0074, 0x0061, 0xfffe,
        0x0073, 0x0065, 0x0072, 0x0075, 0x006D, 0x0000};
    {
      constexpr int count = std::size(soft_expected) - 1;
      unsigned short buffer[std::size(soft_expected)];
      memset(buffer, 0, sizeof(buffer));

      EXPECT_EQ(count + 1,
                FPDFText_GetText(textpage.get(), 0, count, buffer));
      for (int i = 0; i < count; i++)
        EXPECT_EQ(soft_expected[i], buffer[i]);
    }

    // Check that hard hyphens are included
    {
      // There isn't the \0 in the actual doc, but there is a \r\n, so need to
      // add 1 to get aligned.
      constexpr size_t offset = std::size(soft_expected) + 1;
      // Expecting 'User-\r\ngenerated', the - is a unicode character, so
      // cannot store in a char[].
      constexpr unsigned short hard_expected[] = {
          0x0055, 0x0073, 0x0065, 0x0072, 0x2010, 0x000d,
          0x000a, 0x0067, 0x0065, 0x006e, 0x0065, 0x0072,
          0x0061, 0x0074, 0x0065, 0x0064, 0x0000};
      constexpr int count = std::size(hard_expected) - 1;
      unsigned short buffer[std::size(hard_expected)];
      // Zero the buffer, as the soft-hyphen case does, so the comparisons
      // below never read indeterminate values if fewer characters are written
      // than expected.
      memset(buffer, 0, sizeof(buffer));

      EXPECT_EQ(count + 1,
                FPDFText_GetText(textpage.get(), offset, count, buffer));
      for (int i = 0; i < count; i++)
        EXPECT_EQ(hard_expected[i], buffer[i]);
    }
  }

  UnloadPage(page);
}
1092 
// Loads and immediately releases the text page of bug_782596.pdf.
TEST_F(FPDFTextEmbedderTest, bug_782596) {
  // A regression here is only detectable when running under ASAN.
  ASSERT_TRUE(OpenDocument("bug_782596.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);
  {
    // The scoped handle closes the text page at the end of this block, before
    // the page is unloaded.
    ScopedFPDFTextPage scoped_text(FPDFText_LoadPage(page));
    ASSERT_TRUE(scoped_text);
  }
  UnloadPage(page);
}
1103 
// Control characters embedded in the content stream must not show up in the
// text returned by FPDFText_GetText().
TEST_F(FPDFTextEmbedderTest, ControlCharacters) {
  ASSERT_TRUE(OpenDocument("control_characters.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handle: the text page is closed even if an ASSERT_* below returns
    // early, and always before the page is unloaded.
    ScopedFPDFTextPage textpage(FPDFText_LoadPage(page));
    ASSERT_TRUE(textpage);

    // Should not include the control characters in the output
    unsigned short buffer[128];
    memset(buffer, 0xbd, sizeof(buffer));
    int num_chars = FPDFText_GetText(textpage.get(), 0, 128, buffer);
    ASSERT_EQ(kHelloGoodbyeTextSize, num_chars);
    EXPECT_TRUE(check_unsigned_shorts(kHelloGoodbyeText, buffer,
                                      kHelloGoodbyeTextSize));

    // Attempting to get a chunk of text after the control characters
    static const char expected_substring[] = "Goodbye, world!";
    // Offset is the length of 'Hello, world!\r\n' + 2 control characters in
    // the original stream
    static const int offset = 17;
    memset(buffer, 0xbd, sizeof(buffer));
    num_chars = FPDFText_GetText(textpage.get(), offset, 128, buffer);

    ASSERT_GE(num_chars, 0);
    EXPECT_EQ(sizeof(expected_substring), static_cast<size_t>(num_chars));
    EXPECT_TRUE(check_unsigned_shorts(expected_substring, buffer,
                                      sizeof(expected_substring)));
  }

  UnloadPage(page);
}
1136 
1137 // Testing that hyphen makers (0x0002) are replacing hard hyphens when
1138 // the word contains non-ASCII characters.
TEST_F(FPDFTextEmbedderTest,bug_1029)1139 TEST_F(FPDFTextEmbedderTest, bug_1029) {
1140   ASSERT_TRUE(OpenDocument("bug_1029.pdf"));
1141   FPDF_PAGE page = LoadPage(0);
1142   ASSERT_TRUE(page);
1143 
1144   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
1145   ASSERT_TRUE(textpage);
1146 
1147   constexpr int page_range_offset = 171;
1148   constexpr int page_range_length = 56;
1149 
1150   // This text is:
1151   // 'METADATA table. When the split has committed, it noti' followed
1152   // by a 'soft hyphen' (0x0002) and then 'fi'.
1153   //
1154   // The original text has a fi ligature, but that is broken up into
1155   // two characters when the PDF is processed.
1156   constexpr unsigned int expected[] = {
1157       0x004d, 0x0045, 0x0054, 0x0041, 0x0044, 0x0041, 0x0054, 0x0041,
1158       0x0020, 0x0074, 0x0061, 0x0062, 0x006c, 0x0065, 0x002e, 0x0020,
1159       0x0057, 0x0068, 0x0065, 0x006e, 0x0020, 0x0074, 0x0068, 0x0065,
1160       0x0020, 0x0073, 0x0070, 0x006c, 0x0069, 0x0074, 0x0020, 0x0068,
1161       0x0061, 0x0073, 0x0020, 0x0063, 0x006f, 0x006d, 0x006d, 0x0069,
1162       0x0074, 0x0074, 0x0065, 0x0064, 0x002c, 0x0020, 0x0069, 0x0074,
1163       0x0020, 0x006e, 0x006f, 0x0074, 0x0069, 0x0002, 0x0066, 0x0069};
1164   static_assert(page_range_length == std::size(expected),
1165                 "Expected should be the same size as the range being "
1166                 "extracted from page.");
1167   EXPECT_LT(page_range_offset + page_range_length,
1168             FPDFText_CountChars(textpage));
1169 
1170   for (int i = 0; i < page_range_length; ++i) {
1171     EXPECT_EQ(expected[i],
1172               FPDFText_GetUnicode(textpage, page_range_offset + i));
1173   }
1174 
1175   FPDFText_ClosePage(textpage);
1176   UnloadPage(page);
1177 }
1178 
TEST_F(FPDFTextEmbedderTest,CountRects)1179 TEST_F(FPDFTextEmbedderTest, CountRects) {
1180   ASSERT_TRUE(OpenDocument("hello_world.pdf"));
1181   FPDF_PAGE page = LoadPage(0);
1182   ASSERT_TRUE(page);
1183 
1184   FPDF_TEXTPAGE textpage = FPDFText_LoadPage(page);
1185   ASSERT_TRUE(textpage);
1186 
1187   // Sanity check hello_world.pdf.
1188   // |num_chars| check includes the terminating NUL that is provided.
1189   {
1190     unsigned short buffer[128];
1191     int num_chars = FPDFText_GetText(textpage, 0, 128, buffer);
1192     ASSERT_EQ(kHelloGoodbyeTextSize, num_chars);
1193     EXPECT_TRUE(check_unsigned_shorts(kHelloGoodbyeText, buffer,
1194                                       kHelloGoodbyeTextSize));
1195   }
1196 
1197   // Now test FPDFText_CountRects().
1198   static const int kHelloWorldEnd = strlen("Hello, world!");
1199   static const int kGoodbyeWorldStart = kHelloWorldEnd + 2;  // "\r\n"
1200   for (int start = 0; start < kHelloWorldEnd; ++start) {
1201     // Always grab some part of "hello world" and some part of "goodbye world"
1202     // Since -1 means "all".
1203     EXPECT_EQ(2, FPDFText_CountRects(textpage, start, -1));
1204 
1205     // No characters always means 0 rects.
1206     EXPECT_EQ(0, FPDFText_CountRects(textpage, start, 0));
1207 
1208     // 1 character stays within "hello world"
1209     EXPECT_EQ(1, FPDFText_CountRects(textpage, start, 1));
1210 
1211     // When |start| is 0, Having |kGoodbyeWorldStart| char count does not reach
1212     // "goodbye world".
1213     int expected_value = start ? 2 : 1;
1214     EXPECT_EQ(expected_value,
1215               FPDFText_CountRects(textpage, start, kGoodbyeWorldStart));
1216 
1217     // Extremely large character count will always return 2 rects because
1218     // |start| starts inside "hello world".
1219     EXPECT_EQ(2, FPDFText_CountRects(textpage, start, 500));
1220   }
1221 
1222   // Now test negative counts.
1223   for (int start = 0; start < kHelloWorldEnd; ++start) {
1224     EXPECT_EQ(2, FPDFText_CountRects(textpage, start, -100));
1225     EXPECT_EQ(2, FPDFText_CountRects(textpage, start, -2));
1226   }
1227 
1228   // Now test larger start values.
1229   const int kExpectedLength = strlen(kHelloGoodbyeText);
1230   for (int start = kGoodbyeWorldStart + 1; start < kExpectedLength; ++start) {
1231     EXPECT_EQ(1, FPDFText_CountRects(textpage, start, -1));
1232     EXPECT_EQ(0, FPDFText_CountRects(textpage, start, 0));
1233     EXPECT_EQ(1, FPDFText_CountRects(textpage, start, 1));
1234     EXPECT_EQ(1, FPDFText_CountRects(textpage, start, 2));
1235     EXPECT_EQ(1, FPDFText_CountRects(textpage, start, 500));
1236   }
1237 
1238   // Now test start values that starts beyond the end of the text.
1239   for (int start = kExpectedLength; start < 100; ++start) {
1240     EXPECT_EQ(0, FPDFText_CountRects(textpage, start, -1));
1241     EXPECT_EQ(0, FPDFText_CountRects(textpage, start, 0));
1242     EXPECT_EQ(0, FPDFText_CountRects(textpage, start, 1));
1243     EXPECT_EQ(0, FPDFText_CountRects(textpage, start, 2));
1244     EXPECT_EQ(0, FPDFText_CountRects(textpage, start, 500));
1245   }
1246 
1247   FPDFText_ClosePage(textpage);
1248   UnloadPage(page);
1249 }
1250 
// Exercises FPDFTextObj_GetText(): size query, retrieval, null arguments and
// a too-small buffer.
TEST_F(FPDFTextEmbedderTest, GetText) {
  ASSERT_TRUE(OpenDocument("hello_world.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handle: the text page is closed even if one of the many ASSERT_*
    // checks below returns early, and always before the page is unloaded.
    ScopedFPDFTextPage scoped_text_page(FPDFText_LoadPage(page));
    ASSERT_TRUE(scoped_text_page);
    FPDF_TEXTPAGE text_page = scoped_text_page.get();

    EXPECT_EQ(2, FPDFPage_CountObjects(page));
    FPDF_PAGEOBJECT text_object = FPDFPage_GetObject(page, 0);
    ASSERT_TRUE(text_object);

    // Positive testing.
    constexpr char kHelloText[] = "Hello, world!";
    // Return value includes the terminating NUL that is provided.
    constexpr unsigned long kHelloUTF16Size = std::size(kHelloText) * 2;
    constexpr wchar_t kHelloWideText[] = L"Hello, world!";
    unsigned long size =
        FPDFTextObj_GetText(text_object, text_page, nullptr, 0);
    ASSERT_EQ(kHelloUTF16Size, size);

    std::vector<unsigned short> buffer(size);
    ASSERT_EQ(size, FPDFTextObj_GetText(text_object, text_page, buffer.data(),
                                        size));
    ASSERT_EQ(kHelloWideText, GetPlatformWString(buffer.data()));

    // Negative testing.
    ASSERT_EQ(0U, FPDFTextObj_GetText(nullptr, text_page, nullptr, 0));
    ASSERT_EQ(0U, FPDFTextObj_GetText(text_object, nullptr, nullptr, 0));
    ASSERT_EQ(0U, FPDFTextObj_GetText(nullptr, nullptr, nullptr, 0));

    // Buffer is too small, ensure it's not modified.
    buffer.resize(2);
    buffer[0] = 'x';
    buffer[1] = '\0';
    size = FPDFTextObj_GetText(text_object, text_page, buffer.data(),
                               buffer.size());
    ASSERT_EQ(kHelloUTF16Size, size);
    ASSERT_EQ('x', buffer[0]);
    ASSERT_EQ('\0', buffer[1]);
  }

  UnloadPage(page);
}
1294 
// For every page of cropped_text.pdf: the full text is still extractable,
// while FPDFText_GetBoundedText() over the page bounding box returns only the
// text inside it.
TEST_F(FPDFTextEmbedderTest, CroppedText) {
  static constexpr int kPageCount = 4;
  static constexpr FS_RECTF kBoxes[kPageCount] = {
      {50.0f, 150.0f, 150.0f, 50.0f},
      {50.0f, 150.0f, 150.0f, 50.0f},
      {60.0f, 150.0f, 150.0f, 60.0f},
      {60.0f, 150.0f, 150.0f, 60.0f},
  };
  static constexpr const char* kExpectedText[kPageCount] = {
      " world!\r\ndbye, world!",
      " world!\r\ndbye, world!",
      "bye, world!",
      "bye, world!",
  };

  ASSERT_TRUE(OpenDocument("cropped_text.pdf"));
  ASSERT_EQ(kPageCount, FPDF_GetPageCount(document()));

  for (int page_index = 0; page_index < kPageCount; ++page_index) {
    FPDF_PAGE page = LoadPage(page_index);
    ASSERT_TRUE(page);

    // The reported bounding box must match the expected box for this page.
    FS_RECTF bounds;
    EXPECT_TRUE(FPDF_GetPageBoundingBox(page, &bounds));
    EXPECT_EQ(kBoxes[page_index].left, bounds.left);
    EXPECT_EQ(kBoxes[page_index].top, bounds.top);
    EXPECT_EQ(kBoxes[page_index].right, bounds.right);
    EXPECT_EQ(kBoxes[page_index].bottom, bounds.bottom);

    {
      ScopedFPDFTextPage scoped_text(FPDFText_LoadPage(page));
      ASSERT_TRUE(scoped_text);
      FPDF_TEXTPAGE text = scoped_text.get();

      // Unbounded extraction returns the complete text.
      unsigned short buffer[128];
      memset(buffer, 0xbd, sizeof(buffer));
      int full_length = FPDFText_GetText(text, 0, 128, buffer);
      ASSERT_EQ(kHelloGoodbyeTextSize, full_length);
      EXPECT_TRUE(check_unsigned_shorts(kHelloGoodbyeText, buffer,
                                        kHelloGoodbyeTextSize));

      // With a null buffer, the bounded query reports the character count.
      const char* bounded_text = kExpectedText[page_index];
      int bounded_length = strlen(bounded_text);
      ASSERT_EQ(bounded_length,
                FPDFText_GetBoundedText(text, bounds.left, bounds.top,
                                        bounds.right, bounds.bottom, nullptr,
                                        0));

      // With a real buffer, the return value is one larger than that count.
      memset(buffer, 0xbd, sizeof(buffer));
      ASSERT_EQ(bounded_length + 1,
                FPDFText_GetBoundedText(text, bounds.left, bounds.top,
                                        bounds.right, bounds.bottom, buffer,
                                        128));
      EXPECT_TRUE(check_unsigned_shorts(bounded_text, buffer, bounded_length));
    }

    UnloadPage(page);
  }
}
1351 
// A leading control character must neither appear in the extracted text nor
// prevent extraction.
TEST_F(FPDFTextEmbedderTest, Bug_1139) {
  ASSERT_TRUE(OpenDocument("bug_1139.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handle: the text page is closed even if the ASSERT_EQ below
    // returns early, and always before the page is unloaded.
    ScopedFPDFTextPage text_page(FPDFText_LoadPage(page));
    ASSERT_TRUE(text_page);

    // -1 for CountChars not including the \0, but +1 for the extra control
    // character.
    EXPECT_EQ(kHelloGoodbyeTextSize, FPDFText_CountChars(text_page.get()));

    // There is an extra control character at the beginning of the string, but
    // it should not appear in the output nor prevent extracting the text.
    unsigned short buffer[128];
    int num_chars = FPDFText_GetText(text_page.get(), 0, 128, buffer);
    ASSERT_EQ(kHelloGoodbyeTextSize, num_chars);
    EXPECT_TRUE(check_unsigned_shorts(kHelloGoodbyeText, buffer,
                                      kHelloGoodbyeTextSize));
  }

  UnloadPage(page);
}
1374 
// bug_642.pdf must yield the text "ABCD" through both FPDFText_CountChars()
// and FPDFText_GetText().
TEST_F(FPDFTextEmbedderTest, Bug_642) {
  ASSERT_TRUE(OpenDocument("bug_642.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);
  {
    ScopedFPDFTextPage scoped_text(FPDFText_LoadPage(page));
    ASSERT_TRUE(scoped_text);
    FPDF_TEXTPAGE text = scoped_text.get();

    constexpr char kText[] = "ABCD";
    // Number of array elements, i.e. including the terminating NUL.
    constexpr size_t kTextSize = std::size(kText);
    constexpr int kSizeWithNul = static_cast<int>(kTextSize);
    // CountChars does not count the terminating NUL.
    constexpr int kCharCount = kSizeWithNul - 1;
    EXPECT_EQ(kCharCount, FPDFText_CountChars(text));

    unsigned short buffer[kTextSize];
    int chars_written = FPDFText_GetText(text, 0, kCharCount, buffer);
    ASSERT_EQ(kSizeWithNul, chars_written);
    EXPECT_TRUE(check_unsigned_shorts(kText, buffer, kTextSize));
  }

  UnloadPage(page);
}
1398 
// Checks FPDFText_GetCharAngle() for invalid arguments and for one character
// in each quadrant of rotated_text.pdf.
TEST_F(FPDFTextEmbedderTest, GetCharAngle) {
  ASSERT_TRUE(OpenDocument("rotated_text.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    ScopedFPDFTextPage scoped_text(FPDFText_LoadPage(page));
    ASSERT_TRUE(scoped_text);
    FPDF_TEXTPAGE text = scoped_text.get();

    static constexpr int kSubstringsSize[] = {
        std::size("Hello,"), std::size(" world!\r\n"), std::size("Goodbye,")};

    // -1 for CountChars not including the \0, but +1 for the extra control
    // character.
    EXPECT_EQ(kHelloGoodbyeTextSize, FPDFText_CountChars(text));

    // Invalid text page or index reports -1.
    EXPECT_FLOAT_EQ(-1.0f, FPDFText_GetCharAngle(nullptr, 0));
    EXPECT_FLOAT_EQ(-1.0f, FPDFText_GetCharAngle(text, -1));
    EXPECT_FLOAT_EQ(-1.0f,
                    FPDFText_GetCharAngle(text, kHelloGoodbyeTextSize + 1));

    // Character indices probed for the second, third and fourth quadrant; the
    // first quadrant is probed at index 0.
    const int second_index = kSubstringsSize[0];
    const int third_index = kSubstringsSize[0] + kSubstringsSize[1];
    const int fourth_index =
        kSubstringsSize[0] + kSubstringsSize[1] + kSubstringsSize[2];

    // Test GetCharAngle for every quadrant
    EXPECT_NEAR(FXSYS_PI / 4.0, FPDFText_GetCharAngle(text, 0), 0.001);
    EXPECT_NEAR(3 * FXSYS_PI / 4.0, FPDFText_GetCharAngle(text, second_index),
                0.001);
    EXPECT_NEAR(5 * FXSYS_PI / 4.0, FPDFText_GetCharAngle(text, third_index),
                0.001);
    EXPECT_NEAR(7 * FXSYS_PI / 4.0, FPDFText_GetCharAngle(text, fourth_index),
                0.001);
  }

  UnloadPage(page);
}
1436 
// Checks FPDFText_GetFontWeight() for invalid arguments and for the two
// characters of font_weight.pdf.
TEST_F(FPDFTextEmbedderTest, GetFontWeight) {
  ASSERT_TRUE(OpenDocument("font_weight.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    ScopedFPDFTextPage scoped_text(FPDFText_LoadPage(page));
    ASSERT_TRUE(scoped_text);
    FPDF_TEXTPAGE text = scoped_text.get();

    EXPECT_EQ(2, FPDFText_CountChars(text));

    // Invalid text page or index reports -1.
    EXPECT_EQ(-1, FPDFText_GetFontWeight(nullptr, 0));
    EXPECT_EQ(-1, FPDFText_GetFontWeight(text, -1));
    EXPECT_EQ(-1, FPDFText_GetFontWeight(text, 314));

    // The first character's font only specifies /StemV (80), so the returned
    // weight should be derived from it (80*5 == 400).
    EXPECT_EQ(400, FPDFText_GetFontWeight(text, 0));

    // With a /StemV value of 82 the estimate comes out to 410, even though
    // /FontWeight is 400.
    // TODO(crbug.com/pdfium/1420): Fix the return value here.
    EXPECT_EQ(410, FPDFText_GetFontWeight(text, 1));
  }

  UnloadPage(page);
}
1463 
// Checks FPDFText_GetTextRenderMode() for invalid arguments and for filled
// and stroked characters in text_render_mode.pdf.
TEST_F(FPDFTextEmbedderTest, GetTextRenderMode) {
  ASSERT_TRUE(OpenDocument("text_render_mode.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handle: the text page is closed even if an ASSERT_* below returns
    // early, and always before the page is unloaded.
    ScopedFPDFTextPage text_page(FPDFText_LoadPage(page));
    ASSERT_TRUE(text_page);

    ASSERT_EQ(12, FPDFText_CountChars(text_page.get()));

    // Invalid text page or index reports the unknown render mode.
    ASSERT_EQ(FPDF_TEXTRENDERMODE_UNKNOWN,
              FPDFText_GetTextRenderMode(nullptr, 0));
    ASSERT_EQ(FPDF_TEXTRENDERMODE_UNKNOWN,
              FPDFText_GetTextRenderMode(text_page.get(), -1));
    ASSERT_EQ(FPDF_TEXTRENDERMODE_UNKNOWN,
              FPDFText_GetTextRenderMode(text_page.get(), 314));

    ASSERT_EQ(FPDF_TEXTRENDERMODE_FILL,
              FPDFText_GetTextRenderMode(text_page.get(), 0));

    ASSERT_EQ(FPDF_TEXTRENDERMODE_STROKE,
              FPDFText_GetTextRenderMode(text_page.get(), 7));
  }

  UnloadPage(page);
}
1489 
// Checks FPDFText_GetFillColor() for invalid arguments, null out-parameters,
// and the single character of text_color.pdf.
TEST_F(FPDFTextEmbedderTest, GetFillColor) {
  ASSERT_TRUE(OpenDocument("text_color.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handle: the text page is closed even if an ASSERT_* below returns
    // early, and always before the page is unloaded.
    ScopedFPDFTextPage scoped_text_page(FPDFText_LoadPage(page));
    ASSERT_TRUE(scoped_text_page);
    FPDF_TEXTPAGE text_page = scoped_text_page.get();

    ASSERT_EQ(1, FPDFText_CountChars(text_page));

    // Null text page, bad indices, and null out-parameters must all fail.
    ASSERT_FALSE(
        FPDFText_GetFillColor(nullptr, 0, nullptr, nullptr, nullptr, nullptr));
    ASSERT_FALSE(FPDFText_GetFillColor(text_page, -1, nullptr, nullptr,
                                       nullptr, nullptr));
    ASSERT_FALSE(FPDFText_GetFillColor(text_page, 314, nullptr, nullptr,
                                       nullptr, nullptr));
    ASSERT_FALSE(FPDFText_GetFillColor(text_page, 0, nullptr, nullptr, nullptr,
                                       nullptr));

    unsigned int r;
    unsigned int g;
    unsigned int b;
    unsigned int a;
    ASSERT_TRUE(FPDFText_GetFillColor(text_page, 0, &r, &g, &b, &a));
    ASSERT_EQ(0xffu, r);
    ASSERT_EQ(0u, g);
    ASSERT_EQ(0u, b);
    ASSERT_EQ(0xffu, a);
  }

  UnloadPage(page);
}
1522 
// Checks FPDFText_GetStrokeColor() for invalid arguments, null
// out-parameters, and the single character of text_color.pdf.
TEST_F(FPDFTextEmbedderTest, GetStrokeColor) {
  ASSERT_TRUE(OpenDocument("text_color.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    // Scoped handle: the text page is closed even if an ASSERT_* below returns
    // early, and always before the page is unloaded.
    ScopedFPDFTextPage scoped_text_page(FPDFText_LoadPage(page));
    ASSERT_TRUE(scoped_text_page);
    FPDF_TEXTPAGE text_page = scoped_text_page.get();

    ASSERT_EQ(1, FPDFText_CountChars(text_page));

    // Null text page, bad indices, and null out-parameters must all fail.
    ASSERT_FALSE(FPDFText_GetStrokeColor(nullptr, 0, nullptr, nullptr, nullptr,
                                         nullptr));
    ASSERT_FALSE(FPDFText_GetStrokeColor(text_page, -1, nullptr, nullptr,
                                         nullptr, nullptr));
    ASSERT_FALSE(FPDFText_GetStrokeColor(text_page, 314, nullptr, nullptr,
                                         nullptr, nullptr));
    ASSERT_FALSE(FPDFText_GetStrokeColor(text_page, 0, nullptr, nullptr,
                                         nullptr, nullptr));

    unsigned int r;
    unsigned int g;
    unsigned int b;
    unsigned int a;
    ASSERT_TRUE(FPDFText_GetStrokeColor(text_page, 0, &r, &g, &b, &a));
    ASSERT_EQ(0u, r);
    ASSERT_EQ(0xffu, g);
    ASSERT_EQ(0u, b);
    ASSERT_EQ(0xffu, a);
  }

  UnloadPage(page);
}
1555 
TEST_F(FPDFTextEmbedderTest,GetMatrix)1556 TEST_F(FPDFTextEmbedderTest, GetMatrix) {
1557   constexpr char kExpectedText[] = "A1\r\nA2\r\nA3";
1558   constexpr size_t kExpectedTextSize = std::size(kExpectedText);
1559   constexpr FS_MATRIX kExpectedMatrices[] = {
1560       {12.0f, 0.0f, 0.0f, 10.0f, 66.0f, 90.0f},
1561       {12.0f, 0.0f, 0.0f, 10.0f, 66.0f, 90.0f},
1562       {1.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f},
1563       {1.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f},
1564       {12.0f, 0.0f, 0.0f, 10.0f, 38.0f, 60.0f},
1565       {12.0f, 0.0f, 0.0f, 10.0f, 38.0f, 60.0f},
1566       {1.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f},
1567       {1.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f},
1568       {1.0f, 0.0f, 0.0f, 0.833333, 60.0f, 130.0f},
1569       {1.0f, 0.0f, 0.0f, 0.833333, 60.0f, 130.0f},
1570   };
1571   constexpr size_t kExpectedCount = std::size(kExpectedMatrices);
1572   static_assert(kExpectedCount + 1 == kExpectedTextSize,
1573                 "Bad expected matrix size");
1574 
1575   ASSERT_TRUE(OpenDocument("font_matrix.pdf"));
1576   FPDF_PAGE page = LoadPage(0);
1577   ASSERT_TRUE(page);
1578 
1579   {
1580     ScopedFPDFTextPage text_page(FPDFText_LoadPage(page));
1581     ASSERT_TRUE(text_page);
1582     ASSERT_EQ(static_cast<int>(kExpectedCount),
1583               FPDFText_CountChars(text_page.get()));
1584 
1585     {
1586       // Check the characters.
1587       unsigned short buffer[kExpectedTextSize];
1588       ASSERT_EQ(static_cast<int>(kExpectedTextSize),
1589                 FPDFText_GetText(text_page.get(), 0, kExpectedCount, buffer));
1590       EXPECT_TRUE(
1591           check_unsigned_shorts(kExpectedText, buffer, kExpectedTextSize));
1592     }
1593 
1594     // Check the character matrix.
1595     FS_MATRIX matrix;
1596     for (size_t i = 0; i < kExpectedCount; ++i) {
1597       ASSERT_TRUE(FPDFText_GetMatrix(text_page.get(), i, &matrix)) << i;
1598       EXPECT_FLOAT_EQ(kExpectedMatrices[i].a, matrix.a) << i;
1599       EXPECT_FLOAT_EQ(kExpectedMatrices[i].b, matrix.b) << i;
1600       EXPECT_FLOAT_EQ(kExpectedMatrices[i].c, matrix.c) << i;
1601       EXPECT_FLOAT_EQ(kExpectedMatrices[i].d, matrix.d) << i;
1602       EXPECT_FLOAT_EQ(kExpectedMatrices[i].e, matrix.e) << i;
1603       EXPECT_FLOAT_EQ(kExpectedMatrices[i].f, matrix.f) << i;
1604     }
1605 
1606     // Check bad parameters.
1607     EXPECT_FALSE(FPDFText_GetMatrix(nullptr, 0, &matrix));
1608     EXPECT_FALSE(FPDFText_GetMatrix(text_page.get(), 10, &matrix));
1609     EXPECT_FALSE(FPDFText_GetMatrix(text_page.get(), -1, &matrix));
1610     EXPECT_FALSE(FPDFText_GetMatrix(text_page.get(), 0, nullptr));
1611   }
1612 
1613   UnloadPage(page);
1614 }
1615 
// Checks the tight and loose character box dimensions reported for the
// characters at indices 0, 4 and 8 of font_matrix.pdf.
TEST_F(FPDFTextEmbedderTest, CharBox) {
  // For a size 12 letter 'A'.
  constexpr double kExpectedCharWidth = 8.460;
  constexpr double kExpectedCharHeight = 6.600;
  constexpr float kExpectedLooseCharWidth = 8.664f;
  constexpr float kExpectedLooseCharHeight = 12.0f;
  // Character indices probed by both passes below.
  constexpr int kCharIndices[] = {0, 4, 8};

  ASSERT_TRUE(OpenDocument("font_matrix.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    ScopedFPDFTextPage text_page(FPDFText_LoadPage(page));
    ASSERT_TRUE(text_page);
    FPDF_TEXTPAGE handle = text_page.get();

    // First pass: the tight character box of each probed character.
    double left;
    double right;
    double bottom;
    double top;
    for (const int char_index : kCharIndices) {
      ASSERT_TRUE(FPDFText_GetCharBox(handle, char_index, &left, &right,
                                      &bottom, &top));
      EXPECT_NEAR(kExpectedCharWidth, right - left, 0.001);
      EXPECT_NEAR(kExpectedCharHeight, top - bottom, 0.001);
    }

    // Second pass: the loose character box of each probed character.
    FS_RECTF rect;
    for (const int char_index : kCharIndices) {
      ASSERT_TRUE(FPDFText_GetLooseCharBox(handle, char_index, &rect));
      EXPECT_FLOAT_EQ(kExpectedLooseCharWidth, rect.right - rect.left);
      if (char_index == 8) {
        // The height at index 8 is only checked to within an absolute
        // tolerance rather than with EXPECT_FLOAT_EQ.
        EXPECT_NEAR(kExpectedLooseCharHeight, rect.top - rect.bottom, 0.00001);
      } else {
        EXPECT_FLOAT_EQ(kExpectedLooseCharHeight, rect.top - rect.bottom);
      }
    }
  }

  UnloadPage(page);
}
1664 
// Checks the unicode values and character boxes of the five characters
// extracted from bug_1591.pdf.
TEST_F(FPDFTextEmbedderTest, SmallType3Glyph) {
  // Expected unicode value and character box for one extracted character.
  struct ExpectedChar {
    unsigned int unicode;
    double left;
    double right;
    double bottom;
    double top;
  };
  constexpr ExpectedChar kExpectedChars[] = {
      {49u, 63.439998626708984, 65.360000610351562, 50.0, 61.520000457763672},
      {32u, 62.007999420166016, 62.007999420166016, 50.0, 50.0},
      {50u, 86.0, 88.400001525878906, 50.0, 50.240001678466797},
      {32u, 86.010002136230469, 86.010002136230469, 50.0, 50.0},
      {49u, 99.44000244140625, 101.36000061035156, 50.0, 61.520000457763672},
  };
  constexpr int kExpectedCount = static_cast<int>(std::size(kExpectedChars));

  ASSERT_TRUE(OpenDocument("bug_1591.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    ScopedFPDFTextPage text_page(FPDFText_LoadPage(page));
    ASSERT_TRUE(text_page);
    FPDF_TEXTPAGE handle = text_page.get();
    ASSERT_EQ(kExpectedCount, FPDFText_CountChars(handle));

    // Check every unicode value before looking at any character box.
    for (int index = 0; index < kExpectedCount; ++index) {
      EXPECT_EQ(kExpectedChars[index].unicode,
                FPDFText_GetUnicode(handle, index));
    }

    // Check the character box of each character.
    double left;
    double right;
    double bottom;
    double top;
    for (int index = 0; index < kExpectedCount; ++index) {
      const ExpectedChar& expected = kExpectedChars[index];
      ASSERT_TRUE(
          FPDFText_GetCharBox(handle, index, &left, &right, &bottom, &top));
      EXPECT_DOUBLE_EQ(expected.left, left);
      EXPECT_DOUBLE_EQ(expected.right, right);
      EXPECT_DOUBLE_EQ(expected.bottom, bottom);
      EXPECT_DOUBLE_EQ(expected.top, top);
    }
  }

  UnloadPage(page);
}
1720 
// Compares the characters extracted from bigtable_mini.pdf, one unicode value
// at a time, with a fixed expected string.
TEST_F(FPDFTextEmbedderTest, BigtableTextExtraction) {
  constexpr char kExpectedText[] =
      "{fay,jeff,sanjay,wilsonh,kerr,m3b,tushar,\x02k es,gruber}@google.com";
  // Number of characters, not counting the terminating NUL.
  constexpr int kExpectedTextCount = std::size(kExpectedText) - 1;

  ASSERT_TRUE(OpenDocument("bigtable_mini.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    ScopedFPDFTextPage text_page(FPDFText_LoadPage(page));
    ASSERT_TRUE(text_page);
    FPDF_TEXTPAGE handle = text_page.get();

    const int actual_count = FPDFText_CountChars(handle);
    ASSERT_GE(actual_count, 0);
    ASSERT_EQ(kExpectedTextCount, actual_count);

    // Each extracted character must match the expected string at the same
    // position.
    for (int pos = 0; pos != kExpectedTextCount; ++pos) {
      const uint32_t expected_unicode =
          static_cast<uint32_t>(kExpectedText[pos]);
      EXPECT_EQ(expected_unicode, FPDFText_GetUnicode(handle, pos));
    }
  }

  UnloadPage(page);
}
1745 
// Pins down the text currently extracted from bug_1769.pdf.
TEST_F(FPDFTextEmbedderTest, Bug1769) {
  constexpr int kBufferSize = 128;
  constexpr char kExpectedText[] = "wo d wo d";
  // Length of the expected text including its terminating NUL.
  constexpr int kExpectedSize = std::size(kExpectedText);

  ASSERT_TRUE(OpenDocument("bug_1769.pdf"));
  FPDF_PAGE page = LoadPage(0);
  ASSERT_TRUE(page);

  {
    ScopedFPDFTextPage textpage(FPDFText_LoadPage(page));
    ASSERT_TRUE(textpage);

    unsigned short extracted[kBufferSize] = {};
    // TODO(crbug.com/pdfium/1769): Improve text extraction.
    // The first instance of "world" is visible to the human eye and should be
    // extracted as is. The second instance is not, so how it should be
    // extracted is debatable.
    const int written =
        FPDFText_GetText(textpage.get(), 0, kBufferSize, extracted);
    ASSERT_EQ(kExpectedSize, written);
    EXPECT_TRUE(check_unsigned_shorts(kExpectedText, extracted, kExpectedSize));
  }

  UnloadPage(page);
}
1766