xref: /aosp_15_r20/external/pigweed/pw_tokenizer/generate_decoding_test_data.cc (revision 61c4878ac05f98d0ceed94b57d316916de578985)
1 // Copyright 2020 The Pigweed Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
5 // the License at
6 //
7 //     https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under
13 // the License.
14 
// This program generates C++ (.h) and Python (.py) test data files for the
// tokenizer decoding tests (e.g. decoder_test.py).
//
// To generate the test data, build the target
// pw_tokenizer_generate_decoding_test_data. Execute the binary and move the
// generated files to this directory.
20 
#include <array>
#include <cctype>
#include <cinttypes>
#include <cstdarg>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <limits>
#include <random>
#include <string>
#include <string_view>

#include "pw_span/span.h"
#include "pw_tokenizer/internal/decode.h"
#include "pw_tokenizer/tokenize.h"
#include "pw_varint/varint.h"
33 
34 namespace {
35 
// Defines how to format test cases for the target language.
//
// Instances are aggregate-initialized positionally, so the member order below
// must stay in sync with the initializers of the format constants.
struct SourceFileFormat {
  // Appended to "<name>_test_data" to form the output file path.
  const char* extension;
  // Line-comment token written at the start of every comment line.
  const char* comment;
  // printf format written before the test cases; it is passed the test data
  // name and the test case type, in that order.
  const char* header;
  // printf format written after the test cases; it is passed the name.
  const char* footer;
  // NOTE(review): not read anywhere in the visible code; the writers emit a
  // literal "TestCase(" instead.
  const char* test_case_prefix;
  // Written immediately before the \xNN-escaped bytes of a test case.
  const char* binary_string_prefix;
  // Written immediately after the \xNN-escaped bytes of a test case.
  const char* binary_string_suffix;
};
46 
// License text written at the top of every generated file, one entry per
// output line. Empty entries become a bare comment token; all other entries
// are written as "<comment token> <text>".
// clang-format off
constexpr const char* kCopyrightLines[] = {
"Copyright 2020 The Pigweed Authors",
"",
"Licensed under the Apache License, Version 2.0 (the \"License\"); you may not",
"use this file except in compliance with the License. You may obtain a copy of",
"the License at",
"",
"    https://www.apache.org/licenses/LICENSE-2.0",
"",
"Unless required by applicable law or agreed to in writing, software",
"distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT",
"WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the",
"License for the specific language governing permissions and limitations under",
"the License.",
};
// clang-format on
64 
// printf format for the top of a generated C++ header. It consumes two %s
// arguments: first the test data name (used as the innermost namespace), then
// the C++ type that TestCase aliases. The text ends by opening the kTestData
// array, which the matching footer closes.
constexpr const char kCcHeader[] = R"(#pragma once

#include <string_view>
#include <tuple>

namespace pw::test::%s {

using namespace std::literals::string_view_literals;

// clang-format off
using TestCase = %s;

inline constexpr TestCase kTestData[] = {
)";
80 
// printf format for the end of a generated C++ header: closes the kTestData
// array and the namespace. It consumes one %s argument, the test data name.
constexpr const char kCcFooter[] = R"(
};

}  // namespace pw::test::%s
)";
86 
// printf format for the top of a generated Python module. It consumes two %s
// arguments: the test data name, then the C++ test case type (recorded in a
// comment only). It defines a TestCase() helper that returns its arguments as
// a tuple and ends by opening the TEST_DATA tuple.
constexpr const char kPythonHeader[] = R"("""Generated test data."""

# pylint: disable=line-too-long
# C++ test case type for %s:
#     %s


def TestCase(*args):  # pylint: disable=invalid-name
    return tuple(args)



TEST_DATA = (
)";
101 
// C++ output: a ".h" file with "//" comments in which each encoded byte string
// is written as a "..."sv literal.
constexpr SourceFileFormat kCcFormat{
    ".h", "//", kCcHeader, kCcFooter, "TestCase", "\"", "\"sv"};
104 
// Python output: a ".py" file with "#" comments in which each encoded byte
// string is written as a b'...' literal. The footer only closes the TEST_DATA
// tuple and contains no conversion specifiers.
constexpr SourceFileFormat kPythonFormat{
    ".py", "#", kPythonHeader, "\n)\n", "", "b'", "'"};
107 
108 class TestDataFile {
109  public:
TestDataFile(const char * name,const SourceFileFormat & format,const char * test_case_format)110   TestDataFile(const char* name,
111                const SourceFileFormat& format,
112                const char* test_case_format)
113       : format_(format),
114         name_(name),
115         test_case_format_(test_case_format),
116         path_(std::string(name) + "_test_data" + format_.extension),
117         file_(std::fopen(path_.c_str(), "w")) {}
118 
~TestDataFile()119   ~TestDataFile() { std::fclose(file_); }
120 
fmt() const121   const SourceFileFormat& fmt() const { return format_; }
path() const122   const std::string& path() const { return path_; }
123 
124   // Writes a file with test cases uses the provided function.
WriteTestCases(void (* function)(TestDataFile *))125   void WriteTestCases(void (*function)(TestDataFile*)) {
126     static constexpr const char* kFileBase =
127         &__FILE__[std::string_view(__FILE__).find_last_of('/') + 1];
128 
129     for (const char* line : kCopyrightLines) {
130       printf("%s", fmt().comment);
131       if (line[0] == '\0') {
132         printf("\n");
133       } else {
134         printf(" %s\n", line);
135       }
136     }
137 
138     printf("\n%s AUTOGENERATED - DO NOT EDIT\n", fmt().comment);
139     printf("%s This file contains test data generated by %s.\n",
140            fmt().comment,
141            kFileBase);
142 
143     printf(fmt().header, name_, test_case_format_);
144     function(this);
145     printf(fmt().footer, name_);
146   }
147 
148   // Starts a section of test cases in the file.
Section(const char * comment)149   void Section(const char* comment) {
150     printf("\n%s %s\n", fmt().comment, comment);
151   }
152 
printf(const char * format,...)153   int printf(const char* format, ...) PW_PRINTF_FORMAT(2, 3) {
154     va_list args;
155     va_start(args, format);
156     const int result = std::vfprintf(file_, format, args);
157     va_end(args);
158     return result;
159   }
160 
161  private:
162   SourceFileFormat format_;
163   const char* name_;
164   const char* test_case_format_;
165   std::string path_;
166   FILE* file_;
167 };
168 
169 // Writes a decoding test case to the file.
TestCase(TestDataFile * file,pw::span<const uint8_t> buffer,const char * format,const char * formatted)170 void TestCase(TestDataFile* file,
171               pw::span<const uint8_t> buffer,
172               const char* format,
173               const char* formatted) {
174   file->printf(R"(TestCase("%s", "%s", %s)",
175                format,
176                formatted,
177                file->fmt().binary_string_prefix);
178 
179   for (uint8_t byte : buffer) {
180     file->printf("\\x%02x", byte);
181   }
182 
183   file->printf("%s),\n", file->fmt().binary_string_suffix);
184 }
185 
186 template <size_t kSize>
TestCase(TestDataFile * file,const char * format,const char (& buffer)[kSize],const char * formatted)187 void TestCase(TestDataFile* file,
188               const char* format,
189               const char (&buffer)[kSize],
190               const char* formatted) {
191   TestCase(file,
192            pw::span(reinterpret_cast<const uint8_t*>(buffer), kSize - 1),
193            format,
194            formatted);
195 }
196 
// Tokenizes `format` with the given arguments and writes a test case pairing
// the encoded arguments with what snprintf produces for the same inputs.
// Expects a `TestDataFile* file` to be in scope at the point of use.
//
// __VA_ARGS__ is expanded twice, so ONLY variables / constants should be used.
#define MAKE_TEST_CASE(format, ...)                                          \
  do {                                                                       \
    std::array<uint8_t, 128> tokenized;                                      \
    size_t tokenized_size = tokenized.size();                                \
    PW_TOKENIZE_TO_BUFFER(                                                   \
        tokenized.data(), &tokenized_size, format, ##__VA_ARGS__);           \
                                                                             \
    std::array<char, 128> expected = {};                                     \
    std::snprintf(expected.data(), expected.size(), format, ##__VA_ARGS__);  \
                                                                             \
    /* The leading 4 bytes are the token; only the arguments are kept. */    \
    TestCase(file,                                                           \
             pw::span(tokenized).first(tokenized_size).subspan(4),           \
             format,                                                         \
             expected.data());                                               \
  } while (0)
211 
// Formats the contents like an error. Short alias for
// PW_TOKENIZER_ARG_DECODING_ERROR; it is used below by concatenating its
// result with adjacent string literals to build expected outputs.
// NOTE(review): the exact error markers come from the macro's definition,
// which is not visible in this file.
#define ERROR_STR PW_TOKENIZER_ARG_DECODING_ERROR
214 
215 // Generates data to test tokenized string decoding.
GenerateEncodedStrings(TestDataFile * file)216 void GenerateEncodedStrings(TestDataFile* file) {
217   std::mt19937 random(6006411);
218   std::uniform_int_distribution<int64_t> big;
219   std::uniform_int_distribution<int32_t> medium;
220   std::uniform_int_distribution<int32_t> small(' ', '~');
221   std::uniform_real_distribution<float> real;
222 
223   file->Section("Simple strings");
224   TestCase(file, "%s", "\3SFO", "SFO");
225   TestCase(file, "%s", "\4KSJC", "KSJC");
226   TestCase(file, "%s", "\0", "");
227 
228   TestCase(file, "%5s%s", "\2no\3fun", "   nofun");
229   TestCase(file, "%5s%s", "\6abcdef\0", "abcdef");
230   TestCase(file, "%5s%s", "\0\6abcdef", "     abcdef");
231 
232   TestCase(file,
233            "%s %-6s%s%s%s",
234            "\5Intel\580586\7toaster\1 \4oven",
235            "Intel 80586 toaster oven");
236   TestCase(file,
237            "%s %-6s%s%s%s",
238            "\5Apple\x09"
239            "automatic\7 pencil\1 \x09sharpener",
240            "Apple automatic pencil sharpener");
241 
242   file->Section("Zero-length strings");
243   TestCase(file, "%s-%s", "\x02so\x00", "so-");
244   TestCase(file, "%s-%s", "\x00\04cool", "-cool");
245   TestCase(file, "%s%s%3s%s", "\0\0\0\0", "   ");
246   TestCase(file, "(%5s)(%2s)(%7s)", "\x80\0\x80", "([...])(  )(  [...])");
247 
248   file->Section("Invalid strings");
249   TestCase(file, "%s", "\x03hi", ERROR_STR("%s ERROR (hi)"));
250   TestCase(file, "%30s", "\x03hi", ERROR_STR("%30s ERROR (hi)"));
251   TestCase(file, "%30s", "\x83hi", ERROR_STR("%30s ERROR (hi)"));
252   TestCase(file, "%s", "\x85yo!", ERROR_STR("%s ERROR (yo!)"));
253   TestCase(file, "%s", "\x01", ERROR_STR("%s ERROR"));
254   TestCase(file, "%30s", "\x81", ERROR_STR("%30s ERROR"));
255 
256   file->Section("Continue after truncated string");
257   TestCase(file, "%s %d %s", "\x82go\4\5lunch", "go[...] 2 lunch");
258   TestCase(file, "%6s%s%s", "\x80\x85hello\x05there", " [...]hello[...]there");
259 
260   file->Section("Floating point");
261   TestCase(file, "%1.1f", "\0\0\0\0", "0.0");
262   TestCase(file, "%0.5f", "\xdb\x0f\x49\x40", "3.14159");
263 
264   file->Section("Character");  // ZigZag doubles the value of positive integers.
265   TestCase(file, "%c", "\x40", " ");          // 0x20
266   TestCase(file, "%c", "\x48", "$");          // 0x24
267   TestCase(file, "%c", "\x48", "$");          // 0x24
268   TestCase(file, "100%c!", "\x4A", "100%!");  // 0x25
269 
270   file->Section("Atypical argument types");
271   MAKE_TEST_CASE("%ju", static_cast<uintmax_t>(99));
272   MAKE_TEST_CASE("%jd", static_cast<intmax_t>(99));
273   MAKE_TEST_CASE("%zu", sizeof(uint64_t));
274   MAKE_TEST_CASE("%zd", static_cast<ptrdiff_t>(123));
275   MAKE_TEST_CASE("%td", static_cast<ptrdiff_t>(99));
276 
277   file->Section("Percent character");
278   TestCase(file, "%%", "", "%");
279   TestCase(file, "%%%%%%%%", "abc", "%%%%");
280   TestCase(file, "whoa%%%%wow%%%%!%%", "", "whoa%%wow%%!%");
281   TestCase(file, "This is %d%% effective", "\x02", "This is 1% effective");
282   TestCase(
283       file, "%% is 100%sa%%sign%%%s", "\x01%\x03OK?", "% is 100%a%sign%OK?");
284 
285   file->Section("Percent character prints after errors");
286   TestCase(file, "%s%%", "\x83-10\0", "-10[...]%");
287   TestCase(
288       file, "%d%% is a good %%", "", ERROR_STR("%d MISSING") "% is a good %");
289 
290   file->Section("Various format strings");
291   MAKE_TEST_CASE("!");
292   MAKE_TEST_CASE("%s", "%s");
293   MAKE_TEST_CASE("%s", "hello");
294   MAKE_TEST_CASE("%s%s", "Hello", "old");
295   MAKE_TEST_CASE("%s to the%c%s", "hello", ' ', "whirled");
296   MAKE_TEST_CASE("hello %s %d %d %d", "rolled", 1, 2, 3);
297 
298   TestCase(file, "", "", "");
299   TestCase(file, "This has no specifiers", "", "This has no specifiers");
300   TestCase(file, "%s_or_%3s", "\x05hello\x02hi", "hello_or_ hi");
301   TestCase(file, "%s_or_%3d", "\x05hello\x7f", "hello_or_-64");
302   TestCase(file,
303            "%s or hi%c pi=%1.2e",
304            "\x05hello\x42\xdb\x0f\x49\x40",
305            "hello or hi! pi=3.14e+00");
306   TestCase(file,
307            "Why, %s there. My favorite number is %.2f%c",
308            "\x05hello\xdb\x0f\x49\x40\x42",
309            "Why, hello there. My favorite number is 3.14!");
310 
311   file->Section("Various errors");
312   TestCase(file, "%d", "", ERROR_STR("%d MISSING"));
313 
314   TestCase(file,
315            "ABC%d123%dabc%dABC",
316            "",
317            "ABC" ERROR_STR("%d MISSING") "123" ERROR_STR(
318                "%d SKIPPED") "abc" ERROR_STR("%d SKIPPED") "ABC");
319 
320   TestCase(file,
321            "%sXY%+ldxy%a",
322            "\x83Yo!\x80",
323            "Yo![...]XY" ERROR_STR("%+ld ERROR") "xy" ERROR_STR("%a SKIPPED"));
324 
325   TestCase(file, "%d", "", ERROR_STR("%d MISSING"));
326 
327   TestCase(file,
328            "%sXY%+ldxy%a",
329            "\x83Yo!\x80",
330            "Yo![...]XY" ERROR_STR("%+ld ERROR") "xy" ERROR_STR("%a SKIPPED"));
331 
332   TestCase(file,
333            "%s%lld%9u",
334            "\x81$\x80\x80",
335            "$[...]" ERROR_STR("%lld ERROR") ERROR_STR("%9u SKIPPED"));
336 
337   file->Section("Alternate form (#)");
338   MAKE_TEST_CASE("Hex: %#x", 0xbeef);
339   MAKE_TEST_CASE("Hex: %#08X", 0xfeed);
340 
341   file->Section("Random integers");
342   for (int i = 0; i < 100; ++i) {
343     float f = real(random);
344     MAKE_TEST_CASE(
345         "This is a number: %+08.3e%1.0E%02d%g%G%f%-3f", f, f, i, f, f, f, f);
346   }
347 
348   for (int i = 0; i < 100; ++i) {
349     unsigned long long n1 = big(random);
350     int n2 = medium(random);
351     char ch = static_cast<char>(small(random));
352     if (ch == '"' || ch == '\\') {
353       ch = '\t';
354     }
355 
356     MAKE_TEST_CASE("%s: %llu %d %c", std::to_string(i).c_str(), n1, n2, ch);
357   }
358 
359   for (int i = 0; i < 100; ++i) {
360     const long long n1 = big(random);
361     const unsigned n2 = medium(random);
362     const char ch = static_cast<char>(small(random));
363 
364     MAKE_TEST_CASE(
365         "%s: %lld 0x%16u%08X %d", std::to_string(i).c_str(), n1, n2, n2, ch);
366   }
367 }
368 
369 template <typename T>
OutputVarintTest(TestDataFile * file,T i)370 void OutputVarintTest(TestDataFile* file, T i) {
371   if constexpr (sizeof(T) <= sizeof(int)) {
372     file->printf(R"(TestCase("%%d", "%d", "%%u", "%u", %s)",
373                  static_cast<int>(i),
374                  static_cast<unsigned>(i),
375                  file->fmt().binary_string_prefix);
376   } else {
377     file->printf(R"(TestCase("%%lld", "%lld", "%%llu", "%llu", %s)",
378                  static_cast<long long>(i),
379                  static_cast<unsigned long long>(i),
380                  file->fmt().binary_string_prefix);
381   }
382 
383   std::array<uint8_t, 10> buffer;
384   // All integers are encoded as signed for tokenization.
385   size_t size = pw::varint::Encode(i, pw::as_writable_bytes(pw::span(buffer)));
386 
387   for (size_t j = 0; j < size; ++j) {
388     file->printf("\\x%02x", buffer[j]);
389   }
390 
391   file->printf("%s),\n", file->fmt().binary_string_suffix);
392 }
393 
394 // Generates data to test variable-length integer decoding.
GenerateVarints(TestDataFile * file)395 void GenerateVarints(TestDataFile* file) {
396   std::mt19937 random(6006411);
397   std::uniform_int_distribution<int64_t> signed64;
398   std::uniform_int_distribution<int32_t> signed32;
399   std::uniform_int_distribution<int16_t> signed16;
400 
401   file->Section("Important numbers");
402   OutputVarintTest(file, 0);
403   OutputVarintTest(file, std::numeric_limits<int16_t>::min());
404   OutputVarintTest(file, std::numeric_limits<int16_t>::min() + 1);
405   OutputVarintTest(file, std::numeric_limits<int16_t>::max() - 1);
406   OutputVarintTest(file, std::numeric_limits<int16_t>::max());
407   OutputVarintTest(file, std::numeric_limits<int32_t>::min());
408   OutputVarintTest(file, std::numeric_limits<int32_t>::min() + 1);
409   OutputVarintTest(file, std::numeric_limits<int32_t>::max() - 1);
410   OutputVarintTest(file, std::numeric_limits<int32_t>::max());
411   OutputVarintTest(file, std::numeric_limits<int64_t>::min());
412   OutputVarintTest(file, std::numeric_limits<int64_t>::min() + 1);
413   OutputVarintTest(file, std::numeric_limits<int64_t>::max() - 1);
414   OutputVarintTest(file, std::numeric_limits<int64_t>::max());
415 
416   file->Section("Random 64-bit ints");
417   for (int i = 0; i < 500; ++i) {
418     OutputVarintTest(file, signed64(random));
419   }
420   file->Section("Random 32-bit ints");
421   for (int i = 0; i < 100; ++i) {
422     OutputVarintTest(file, signed32(random));
423   }
424   file->Section("Random 16-bit ints");
425   for (int i = 0; i < 100; ++i) {
426     OutputVarintTest(file, signed16(random));
427   }
428 
429   file->Section("All 8-bit numbers");
430   {
431     int i = std::numeric_limits<int8_t>::min();
432     while (true) {
433       OutputVarintTest(file, i);
434       if (i == std::numeric_limits<int8_t>::max()) {
435         break;
436       }
437       // Don't use an inline increment to avoid undefined behavior (overflow).
438       i += 1;
439     }
440   }
441 }
442 
443 template <typename Function>
WriteFile(const char * name,const char * test_case_format,Function function)444 void WriteFile(const char* name,
445                const char* test_case_format,
446                Function function) {
447   for (const SourceFileFormat& file_format : {kCcFormat, kPythonFormat}) {
448     TestDataFile file(name, file_format, test_case_format);
449     file.WriteTestCases(function);
450 
451     std::printf("Wrote %s\n", file.path().c_str());
452   }
453 }
454 
455 }  // namespace
456 
main(int,char **)457 int main(int, char**) {
458   WriteFile("tokenized_string_decoding",
459             "std::tuple<const char*, std::string_view, std::string_view>",
460             GenerateEncodedStrings);
461   WriteFile("varint_decoding",
462             "std::tuple<const char*, const char*, const char*, const char*, "
463             "std::string_view>",
464             GenerateVarints);
465   return 0;
466 }
467