xref: /aosp_15_r20/external/pigweed/pw_tokenizer/detokenize_test.cc (revision 61c4878ac05f98d0ceed94b57d316916de578985)
1 // Copyright 2020 The Pigweed Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
5 // the License at
6 //
7 //     https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under
13 // the License.
14 
15 #include "pw_tokenizer/detokenize.h"
16 
17 #include <string>
18 #include <string_view>
19 
20 #include "pw_stream/memory_stream.h"
21 #include "pw_tokenizer/example_binary_with_tokenized_strings.h"
22 #include "pw_unit_test/framework.h"
23 
24 namespace pw::tokenizer {
25 namespace {
26 
using namespace std::literals::string_view_literals;

// Use a shorter name for the error string macro.
#define ERR PW_TOKENIZER_ARG_DECODING_ERROR

// A test case: {encoded input, expected detokenized output}.
using Case = std::pair<std::string_view, std::string_view>;

// Packs any number of Cases into a std::array so tests can iterate over them
// with a range-based for loop.
template <typename... Args>
auto TestCases(Args... args) {
  return std::array<Case, sizeof...(Args)>{args...};
}
38 
// Database with the following entries and arbitrary token values:
// {
//   0x00000001: "One",
//   0x00000005: "TWO",
//   0x000000ff: "333",
//   0xDDEEEEFF: "FOUR",
//   0xEEEEEEEE: "$AQAAAA==",  # Nested Base64 token for "One"
//   0xF897A79D: "■msg♦This is $AQAAAA== message■module♦■file♦file.txt",
// }
// Layout: 8-byte header magic, 4-byte entry count, 4 reserved bytes, then one
// 8-byte entry per token (little-endian token + 4-byte removal date, where
// "----" means never removed), followed by the NUL-terminated strings.
constexpr char kTestDatabase[] =
    "TOKENS\0\0"
    "\x06\x00\x00\x00"  // Number of tokens in this database.
    "\0\0\0\0"
    "\x01\x00\x00\x00----"
    "\x05\x00\x00\x00----"
    "\xFF\x00\x00\x00----"
    "\xFF\xEE\xEE\xDD----"
    "\xEE\xEE\xEE\xEE----"
    "\x9D\xA7\x97\xF8----"
    "One\0"
    "TWO\0"
    "333\0"
    "FOUR\0"
    "$AQAAAA==\0"
    "■msg♦This is $AQAAAA== message■module♦■file♦file.txt";
63 
64 class Detokenize : public ::testing::Test {
65  protected:
Detokenize()66   Detokenize() : detok_(TokenDatabase::Create<kTestDatabase>()) {}
67   Detokenizer detok_;
68 };
69 
// Tokens with no arguments decode directly to their database strings.
TEST_F(Detokenize, NoFormatting) {
  EXPECT_EQ(detok_.Detokenize("\1\0\0\0"sv).BestString(), "One");
  EXPECT_EQ(detok_.Detokenize("\5\0\0\0"sv).BestString(), "TWO");
  EXPECT_EQ(detok_.Detokenize("\xff\x00\x00\x00"sv).BestString(), "333");
  EXPECT_EQ(detok_.Detokenize("\xff\xee\xee\xdd"sv).BestString(), "FOUR");
}
76 
TEST_F(Detokenize, FromElfSection) {
  // Create a detokenizer from an ELF file with only the pw_tokenizer sections.
  // See py/detokenize_test.py.
  // Offset and size of the .pw_tokenizer.entries section in bytes.
  constexpr uint32_t kDatabaseOffset = 0x00000174;
  constexpr size_t kDatabaseSize = 0x000004C2;

  pw::span<const uint8_t> token_entries(
      reinterpret_cast<const uint8_t*>(::test::ns::kElfSection.data() +
                                       kDatabaseOffset),
      kDatabaseSize);
  pw::Result<Detokenizer> detok_from_elf =
      Detokenizer::FromElfSection(token_entries);
  ASSERT_TRUE(detok_from_elf.ok());
  EXPECT_EQ(detok_from_elf->Detokenize("\xd6\x8c\x66\x2e").BestString(),
            "Jello, world!");
}
94 
TEST_F(Detokenize, FromElfFile) {
  // Create a detokenizer from an ELF file with only the pw_tokenizer sections.
  // See py/detokenize_test.py.
  stream::MemoryReader stream(::test::ns::kElfSection);

  pw::Result<Detokenizer> detok = Detokenizer::FromElfFile(stream);
  PW_TEST_ASSERT_OK(detok);
  EXPECT_EQ(detok->Detokenize("\xd6\x8c\x66\x2e").BestString(),
            "Jello, world!");
}
105 
// Empty input has no token at all; decoding fails and yields an empty string.
TEST_F(Detokenize, BestString_MissingToken_IsEmpty) {
  EXPECT_FALSE(detok_.Detokenize("").ok());
  EXPECT_TRUE(detok_.Detokenize("", 0u).BestString().empty());
}
110 
// Inputs shorter than 4 bytes are zero-extended to form the 32-bit token.
TEST_F(Detokenize, BestString_ShorterToken_ZeroExtended) {
  EXPECT_EQ(detok_.Detokenize("\x42", 1u).token(), 0x42u);
  EXPECT_EQ(detok_.Detokenize("\1\0"sv).token(), 0x1u);
  EXPECT_EQ(detok_.Detokenize("\1\0\3"sv).token(), 0x030001u);
  EXPECT_EQ(detok_.Detokenize("\0\0\0"sv).token(), 0x0u);
}
117 
// Tokens absent from the database decode to an empty BestString().
TEST_F(Detokenize, BestString_UnknownToken_IsEmpty) {
  EXPECT_FALSE(detok_.Detokenize("\0\0\0\0"sv).ok());
  EXPECT_TRUE(detok_.Detokenize("\0\0\0\0"sv).BestString().empty());
  EXPECT_TRUE(detok_.Detokenize("\2\0\0\0"sv).BestString().empty());
  EXPECT_TRUE(detok_.Detokenize("\x10\x32\x54\x76\x99"sv).BestString().empty());
  EXPECT_TRUE(detok_.Detokenize("\x98\xba\xdc\xfe"sv).BestString().empty());
}
125 
// With errors enabled, a missing token produces an explicit error marker.
TEST_F(Detokenize, BestStringWithErrors_MissingToken_ErrorMessage) {
  EXPECT_FALSE(detok_.Detokenize("").ok());
  EXPECT_EQ(detok_.Detokenize("", 0u).BestStringWithErrors(),
            ERR("missing token"));
}
131 
// Zero-extended short tokens still match their database entries.
TEST_F(Detokenize, BestStringWithErrors_ShorterTokenMatchesStrings) {
  EXPECT_EQ(detok_.Detokenize("\1", 1u).BestStringWithErrors(), "One");
  EXPECT_EQ(detok_.Detokenize("\1\0"sv).BestStringWithErrors(), "One");
  EXPECT_EQ(detok_.Detokenize("\1\0\0"sv).BestStringWithErrors(), "One");
}
137 
// Unknown tokens report the zero-extended token value in hex.
TEST_F(Detokenize, BestStringWithErrors_UnknownToken_ErrorMessage) {
  ASSERT_FALSE(detok_.Detokenize("\0\0\0\0"sv).ok());
  EXPECT_EQ(detok_.Detokenize("\0"sv).BestStringWithErrors(),
            ERR("unknown token 00000000"));
  EXPECT_EQ(detok_.Detokenize("\0\0\0"sv).BestStringWithErrors(),
            ERR("unknown token 00000000"));
  EXPECT_EQ(detok_.Detokenize("\0\0\0\0"sv).BestStringWithErrors(),
            ERR("unknown token 00000000"));
  EXPECT_EQ(detok_.Detokenize("\2\0\0\0"sv).BestStringWithErrors(),
            ERR("unknown token 00000002"));
  EXPECT_EQ(detok_.Detokenize("\x10\x32\x54\x76\x99"sv).BestStringWithErrors(),
            ERR("unknown token 76543210"));
  EXPECT_EQ(detok_.Detokenize("\x98\xba\xdc\xfe"sv).BestStringWithErrors(),
            ERR("unknown token fedcba98"));
}
153 
// Base64 versions of the tokens in kTestDatabase ("$" marks a Base64 token).
#define ONE "$AQAAAA=="       // 0x00000001 -> "One"
#define TWO "$BQAAAA=="       // 0x00000005 -> "TWO"
#define THREE "$/wAAAA=="     // 0x000000ff -> "333"
#define FOUR "$/+7u3Q=="      // 0xDDEEEEFF -> "FOUR"
#define NEST_ONE "$7u7u7g=="  // 0xEEEEEEEE -> "$AQAAAA==" -> "One"
160 
// Detokenizes Base64 tokens embedded in text; non-token text passes through.
TEST_F(Detokenize, Base64_NoArguments) {
  for (auto [data, expected] : TestCases(
           Case{ONE, "One"},
           Case{TWO, "TWO"},
           Case{THREE, "333"},
           Case{FOUR, "FOUR"},
           Case{FOUR ONE ONE, "FOUROneOne"},
           Case{ONE TWO THREE FOUR, "OneTWO333FOUR"},
           Case{ONE "\r\n" TWO "\r\n" THREE "\r\n" FOUR "\r\n",
                "One\r\nTWO\r\n333\r\nFOUR\r\n"},
           Case{"123" FOUR, "123FOUR"},
           Case{"123" FOUR ", 56", "123FOUR, 56"},
           Case{"12" THREE FOUR ", 56", "12333FOUR, 56"},
           Case{"$0" ONE, "$0One"},
           Case{"$/+7u3Q=", "$/+7u3Q="},  // incomplete message (missing "=")
           Case{"$123456==" FOUR, "$123456==FOUR"},
           Case{NEST_ONE, "One"},  // Nested tokens decode recursively.
           Case{NEST_ONE NEST_ONE NEST_ONE, "OneOneOne"},
           Case{FOUR "$" ONE NEST_ONE "?", "FOUR$OneOne?"})) {
    EXPECT_EQ(detok_.DetokenizeText(data), expected);
  }
}
183 
// DecodeOptionallyTokenizedData handles raw binary tokens, Base64 tokens
// embedded in text, and plain text, in any combination.
TEST_F(Detokenize, OptionallyTokenizedData) {
  for (auto [data, expected] : TestCases(
           Case{ONE, "One"},
           Case{"\1\0\0\0", "One"},  // Raw binary token, not Base64.
           Case{TWO, "TWO"},
           Case{THREE, "333"},
           Case{FOUR, "FOUR"},
           Case{FOUR ONE ONE, "FOUROneOne"},
           Case{ONE TWO THREE FOUR, "OneTWO333FOUR"},
           Case{ONE "\r\n" TWO "\r\n" THREE "\r\n" FOUR "\r\n",
                "One\r\nTWO\r\n333\r\nFOUR\r\n"},
           Case{"123" FOUR, "123FOUR"},
           Case{"123" FOUR ", 56", "123FOUR, 56"},
           Case{"12" THREE FOUR ", 56", "12333FOUR, 56"},
           Case{"$0" ONE, "$0One"},
           Case{"$/+7u3Q=", "$/+7u3Q="},  // incomplete message (missing "=")
           Case{"$123456==" FOUR, "$123456==FOUR"},
           Case{NEST_ONE, "One"},
           Case{NEST_ONE NEST_ONE NEST_ONE, "OneOneOne"},
           Case{FOUR "$" ONE NEST_ONE "?", "FOUR$OneOne?"},
           Case{"$naeX+A==",  // Token 0xF897A79D; its string nests a token.
                "■msg♦This is One message■module♦■file♦file.txt"})) {
    EXPECT_EQ(detok_.DecodeOptionallyTokenizedData(as_bytes(span(data))),
              std::string(expected));
  }
}
210 
// Database of 9 format strings taking printf-style arguments. Layout matches
// kTestDatabase: header, entry count, reserved bytes, 8-byte entries (token +
// removal date), then NUL-terminated strings.
constexpr char kDataWithArguments[] =
    "TOKENS\0\0"
    "\x09\x00\x00\x00"
    "\0\0\0\0"
    "\x00\x00\x00\x00----"
    "\x0A\x0B\x0C\x0D----"
    "\x0E\x0F\x00\x01----"
    "\xAA\xAA\xAA\xAA----"
    "\xBB\xBB\xBB\xBB----"
    "\xCC\xCC\xCC\xCC----"
    "\xDD\xDD\xDD\xDD----"
    "\xEE\xEE\xEE\xEE----"
    "\xFF\xFF\xFF\xFF----"
    "\0"  // Token 0x00000000: the empty string.
    "Use the %s, %s.\0"
    "Now there are %d of %s!\0"
    "%c!\0"    // AA
    "%hhu!\0"  // BB
    "%hu!\0"   // CC
    "%u!\0"    // DD
    "%lu!\0"   // EE
    "%llu!";   // FF
233 
234 constexpr TokenDatabase kWithArgs = TokenDatabase::Create<kDataWithArguments>();
235 class DetokenizeWithArgs : public ::testing::Test {
236  protected:
DetokenizeWithArgs()237   DetokenizeWithArgs() : detok_(kWithArgs) {}
238 
239   Detokenizer detok_;
240 };
241 
// A token absent from the database yields no matches at all.
TEST_F(DetokenizeWithArgs, NoMatches) {
  EXPECT_TRUE(detok_.Detokenize("\x23\xab\xc9\x87"sv).matches().empty());
}
245 
// A unique token yields exactly one match.
TEST_F(DetokenizeWithArgs, SingleMatch) {
  EXPECT_EQ(detok_.Detokenize("\x00\x00\x00\x00"sv).matches().size(), 1u);
}
249 
// Token 0x00000000 maps to the empty string in kWithArgs.
TEST_F(DetokenizeWithArgs, Empty) {
  EXPECT_EQ(detok_.Detokenize("\x00\x00\x00\x00"sv).BestString(), "");
}
253 
// Decodes varint/zigzag-encoded arguments appended after the token.
TEST_F(DetokenizeWithArgs, Successful) {
  // Run through test cases, but don't include cases that use %hhu or %llu since
  // these are not currently supported in arm-none-eabi-gcc.
  for (auto [data, expected] : TestCases(
           Case{"\x0A\x0B\x0C\x0D\5force\4Luke"sv, "Use the force, Luke."},
           Case{"\x0E\x0F\x00\x01\4\4them"sv, "Now there are 2 of them!"},
           Case{"\xAA\xAA\xAA\xAA\xfc\x01"sv, "~!"},
           Case{"\xCC\xCC\xCC\xCC\xfe\xff\x07"sv, "65535!"},
           Case{"\xDD\xDD\xDD\xDD\xfe\xff\x07"sv, "65535!"},
           Case{"\xDD\xDD\xDD\xDD\xfe\xff\xff\xff\x1f"sv, "4294967295!"},
           Case{"\xEE\xEE\xEE\xEE\xfe\xff\x07"sv, "65535!"},
           Case{"\xEE\xEE\xEE\xEE\xfe\xff\xff\xff\x1f"sv, "4294967295!"})) {
    EXPECT_EQ(detok_.Detokenize(data).BestString(), expected);
  }
}
269 
// Trailing bytes beyond the format string's arguments are an error.
TEST_F(DetokenizeWithArgs, ExtraDataError) {
  auto error = detok_.Detokenize("\x00\x00\x00\x00MORE data"sv);
  EXPECT_FALSE(error.ok());
  EXPECT_EQ("", error.BestString());
}
275 
// A missing argument leaves the conversion specifier in BestString() and is
// flagged in BestStringWithErrors().
TEST_F(DetokenizeWithArgs, MissingArgumentError) {
  auto error = detok_.Detokenize("\x0A\x0B\x0C\x0D\5force"sv);
  EXPECT_FALSE(error.ok());
  EXPECT_EQ(error.BestString(), "Use the force, %s.");
  EXPECT_EQ(error.BestStringWithErrors(),
            "Use the force, " ERR("%s MISSING") ".");
}
283 
// An undecodable argument is marked ERROR; later arguments are SKIPPED.
TEST_F(DetokenizeWithArgs, DecodingError) {
  auto error = detok_.Detokenize("\x0E\x0F\x00\x01\xFF"sv);
  EXPECT_FALSE(error.ok());
  EXPECT_EQ(error.BestString(), "Now there are %d of %s!");
  EXPECT_EQ(error.BestStringWithErrors(),
            "Now there are " ERR("%d ERROR") " of " ERR("%s SKIPPED") "!");
}
291 
// Database with 15 entries where several tokens collide (map to multiple
// strings). Each 8-byte entry is a 4-byte token followed by a 4-byte removal
// date; \xff\xff\xff\xff means the string is still present, while any other
// date marks it as removed.
constexpr char kDataWithCollisions[] =
    "TOKENS\0\0"
    "\x0F\x00\x00\x00"
    "\0\0\0\0"
    "\x00\x00\x00\x00\xff\xff\xff\xff"  // 1
    "\x00\x00\x00\x00\x01\x02\x03\x04"  // 2
    "\x00\x00\x00\x00\xff\xff\xff\xff"  // 3
    "\x00\x00\x00\x00\xff\xff\xff\xff"  // 4
    "\x00\x00\x00\x00\xff\xff\xff\xff"  // 5
    "\x00\x00\x00\x00\xff\xff\xff\xff"  // 6
    "\x00\x00\x00\x00\xff\xff\xff\xff"  // 7
    "\xAA\xAA\xAA\xAA\x00\x00\x00\x00"  // 8
    "\xAA\xAA\xAA\xAA\xff\xff\xff\xff"  // 9
    "\xBB\xBB\xBB\xBB\xff\xff\xff\xff"  // A
    "\xBB\xBB\xBB\xBB\xff\xff\xff\xff"  // B
    "\xCC\xCC\xCC\xCC\xff\xff\xff\xff"  // C
    "\xCC\xCC\xCC\xCC\xff\xff\xff\xff"  // D
    "\xDD\xDD\xDD\xDD\xff\xff\xff\xff"  // E
    "\xDD\xDD\xDD\xDD\xff\xff\xff\xff"  // F
    // String table
    "This string is present\0"   // 1
    "This string is removed\0"   // 2
    "One arg %d\0"               // 3
    "One arg %s\0"               // 4
    "Two args %s %u\0"           // 5
    "Two args %s %s %% %% %%\0"  // 6
    "Four args %d %d %d %d\0"    // 7
    "This one is removed\0"      // 8
    "This one is present\0"      // 9
    "Two ints %d %d\0"           // A
    "Three ints %d %d %d\0"      // B
    "Three strings %s %s %s\0"   // C
    "Two strings %s %s\0"        // D
    "Three %s %s %s\0"           // E
    "Five %d %d %d %d %s\0";     // F
327 
328 constexpr TokenDatabase kWithCollisions =
329     TokenDatabase::Create<kDataWithCollisions>();
330 
331 class DetokenizeWithCollisions : public ::testing::Test {
332  protected:
DetokenizeWithCollisions()333   DetokenizeWithCollisions() : detok_(kWithCollisions) {}
334 
335   Detokenizer detok_;
336 };
337 
// Among colliding entries, a fully successful decode always wins.
TEST_F(DetokenizeWithCollisions, Collision_AlwaysPreferSuccessfulDecode) {
  for (auto [data, expected] :
       TestCases(Case{"\0\0\0\0"sv, "This string is present"},
                 Case{"\0\0\0\0\x01"sv, "One arg -1"},
                 Case{"\0\0\0\0\x80"sv, "One arg [...]"},
                 Case{"\0\0\0\0\4Hey!\x04"sv, "Two args Hey! 2"})) {
    EXPECT_EQ(detok_.Detokenize(data).BestString(), expected);
  }
}
347 
// When no candidate decodes perfectly, prefer the one that consumes all bytes.
TEST_F(DetokenizeWithCollisions, Collision_PreferDecodingAllBytes) {
  for (auto [data, expected] :
       TestCases(Case{"\0\0\0\0\x80\x80\x80\x80\x00"sv, "Two args [...] 0"},
                 Case{"\0\0\0\0\x08?"sv, "One arg %s"},
                 Case{"\0\0\0\0\x01!\x01\x80"sv, "Two args ! \x80 % % %"})) {
    EXPECT_EQ(detok_.Detokenize(data).BestString(), expected);
  }
}
356 
// Ties are broken by preferring the candidate with fewer decoding errors.
TEST_F(DetokenizeWithCollisions, Collision_PreferFewestDecodingErrors) {
  for (auto [data, expected] :
       TestCases(Case{"\xBB\xBB\xBB\xBB\x00"sv, "Two ints 0 %d"},
                 Case{"\xCC\xCC\xCC\xCC\2Yo\5?"sv, "Two strings Yo %s"})) {
    EXPECT_EQ(detok_.Detokenize(data).BestString(), expected);
  }
}
364 
// With equal errors, the candidate that decoded more arguments ranks first.
TEST_F(DetokenizeWithCollisions, Collision_PreferMostDecodedArgs) {
  auto result = detok_.Detokenize("\xDD\xDD\xDD\xDD\x01\x02\x01\x04\x05"sv);
  EXPECT_EQ((std::string_view)result.matches()[0].value(), "Five -1 1 -1 2 %s");
  EXPECT_EQ((std::string_view)result.matches()[1].value(), "Three \2 \4 %s"sv);
}
370 
TEST_F(DetokenizeWithCollisions, Collision_PreferMostDecodedArgs_NoPercent) {
  // The "Two args %s %s ..." string successfully decodes this, and has more
  // "arguments", because of %%, but %% doesn't count as a decoded argument.
  EXPECT_EQ(detok_.Detokenize("\0\0\0\0\x01\x00\x01\x02"sv).BestString(),
            "Four args -1 0 -1 1");
}
377 
// Entries not marked as removed are preferred over removed entries.
TEST_F(DetokenizeWithCollisions, Collision_PreferStillPresentString) {
  for (auto [data, expected] :
       TestCases(Case{"\x00\x00\x00\x00"sv, "This string is present"},
                 Case{"\xAA\xAA\xAA\xAA"sv, "This one is present"})) {
    EXPECT_EQ(detok_.Detokenize(data).BestString(), expected);
  }
}
385 
// All 7 colliding entries for token 0x00000000 are recorded as matches.
TEST_F(DetokenizeWithCollisions, Collision_TracksAllMatches) {
  auto result = detok_.Detokenize("\0\0\0\0"sv);
  EXPECT_EQ(result.matches().size(), 7u);
}
390 
391 }  // namespace
392 }  // namespace pw::tokenizer
393