1#!/usr/bin/env python
2#
3# Copyright 2011-2018 The Rust Project Developers. See the COPYRIGHT
4# file at the top-level directory of this distribution and at
5# http://rust-lang.org/COPYRIGHT.
6#
7# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
8# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
9# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
10# option. This file may not be copied, modified, or distributed
11# except according to those terms.
12
13# This script uses the following Unicode tables:
14# - DerivedNormalizationProps.txt
15# - NormalizationTest.txt
16# - UnicodeData.txt
17# - StandardizedVariants.txt
18#
# Since these tables should not require frequent updates, we keep this
# generation step out-of-line and check the generated tables.rs and
# normalization_tests.rs files into git.
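#
# To regenerate the tables: run this script with Python 3. It downloads the UCD
# files listed above for UNICODE_VERSION and writes tables.rs and
# normalization_tests.rs to the current working directory.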
import collections
import sys
import urllib.request
23
24UNICODE_VERSION = "15.0.0"
25UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION
26
27PREAMBLE = """// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
28// file at the top-level directory of this distribution and at
29// http://rust-lang.org/COPYRIGHT.
30//
31// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
32// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
33// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
34// option. This file may not be copied, modified, or distributed
35// except according to those terms.
36
37// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
38
39#![allow(missing_docs)]
40"""
41
42NormalizationTest = collections.namedtuple(
43    "NormalizationTest",
44    ["source", "nfc", "nfd", "nfkc", "nfkd"],
45)
46
# Mapping taken from Table 12 of:
48# http://www.unicode.org/reports/tr44/#General_Category_Values
49expanded_categories = {
50    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
51    'Lm': ['L'], 'Lo': ['L'],
52    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
54    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
55    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
56    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
57    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
58    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
59}
60
61# Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
62# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
63S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
64S_COUNT = L_COUNT * V_COUNT * T_COUNT
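# A precomposed Hangul syllable S_BASE + s decomposes arithmetically:
#   l = s // (V_COUNT * T_COUNT); v = (s % (V_COUNT * T_COUNT)) // T_COUNT; t = s % T_COUNT
# For example, U+AC01 (s = 1) gives l = 0, v = 0, t = 1, i.e. the jamo sequence
# U+1100 U+1161 U+11A8 (using the standard jamo bases 0x1100, 0x1161, 0x11A7,
# which this script does not need directly).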
65
66class UnicodeData(object):
67    def __init__(self):
68        self._load_unicode_data()
69        self.norm_props = self._load_norm_props()
70        self.norm_tests = self._load_norm_tests()
71
72        self.canon_comp = self._compute_canonical_comp()
73        self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()
74
75        self.cjk_compat_variants_fully_decomp = {}
76        self._load_cjk_compat_ideograph_variants()
77
78        def stats(name, table):
79            count = sum(len(v) for v in table.values())
80            print("%s: %d chars => %d decomposed chars" % (name, len(table), count))
81
82        print("Decomposition table stats:")
83        stats("Canonical decomp", self.canon_decomp)
84        stats("Compatible decomp", self.compat_decomp)
85        stats("Canonical fully decomp", self.canon_fully_decomp)
86        stats("Compatible fully decomp", self.compat_fully_decomp)
87        stats("CJK Compat Variants fully decomp", self.cjk_compat_variants_fully_decomp)
88
89        self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()
90
91    def _fetch(self, filename):
92        resp = urllib.request.urlopen(UCD_URL + filename)
93        return resp.read().decode('utf-8')
94
95    def _load_unicode_data(self):
96        self.name_to_char_int = {}
97        self.combining_classes = {}
98        self.compat_decomp = {}
99        self.canon_decomp = {}
100        self.general_category_mark = []
101        self.general_category_public_assigned = []
102
        assigned_start = 0
        prev_char_int = -1
        prev_name = ""
106
107        for line in self._fetch("UnicodeData.txt").splitlines():
108            # See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
109            pieces = line.split(';')
110            assert len(pieces) == 15
            char, name, category, cc, decomp = pieces[0], pieces[1].strip(), pieces[2], pieces[3], pieces[5]
            char_int = int(char, 16)

            self.name_to_char_int[name] = char_int
116
117            if cc != '0':
118                self.combining_classes[char_int] = cc
119
120            if decomp.startswith('<'):
121                self.compat_decomp[char_int] = [int(c, 16) for c in decomp.split()[1:]]
122            elif decomp != '':
123                self.canon_decomp[char_int] = [int(c, 16) for c in decomp.split()]
124
125            if category == 'M' or 'M' in expanded_categories.get(category, []):
126                self.general_category_mark.append(char_int)
127
128            assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt"
129            if category not in ['Co', 'Cs']:
130                if char_int != prev_char_int + 1 and not is_first_and_last(prev_name, name):
131                    self.general_category_public_assigned.append((assigned_start, prev_char_int))
132                    assigned_start = char_int
133                prev_char_int = char_int
                prev_name = name
135
136        self.general_category_public_assigned.append((assigned_start, prev_char_int))
137
138    def _load_cjk_compat_ideograph_variants(self):
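        # Data lines have the form
        #   <base char> <variation selector>; <description>; <shaping environments>
        # optionally followed by a '#' comment; we keep only the CJK
        # compatibility ideograph variation sequences that carry no shaping
        # restrictions.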
139        for line in self._fetch("StandardizedVariants.txt").splitlines():
140            strip_comments = line.split('#', 1)[0].strip()
141            if not strip_comments:
142                continue
143
144            variation_sequence, description, differences = strip_comments.split(';')
145            description = description.strip()
146
147            # Don't use variations that only apply in particular shaping environments.
148            if differences:
149                continue
150
151            # Look for entries where the description field is a codepoint name.
152            if description not in self.name_to_char_int:
153                continue
154
155            # Only consider the CJK Compatibility Ideographs.
156            if not description.startswith('CJK COMPATIBILITY IDEOGRAPH-'):
157                continue
158
159            char_int = self.name_to_char_int[description]
160
            assert char_int not in self.combining_classes, "Unexpected: CJK compat variant with a combining class"
            assert char_int not in self.compat_decomp, "Unexpected: CJK compat variant and compatibility decomposition"
163            assert len(self.canon_decomp[char_int]) == 1, "Unexpected: CJK compat variant and non-singleton canonical decomposition"
164            # If we ever need to handle Hangul here, we'll need to handle it separately.
165            assert not (S_BASE <= char_int < S_BASE + S_COUNT)
166
167            cjk_compat_variant_parts = [int(c, 16) for c in variation_sequence.split()]
168            for c in cjk_compat_variant_parts:
                assert c not in self.canon_decomp, "Unexpected: CJK compat variant is unnormalized (canon)"
                assert c not in self.compat_decomp, "Unexpected: CJK compat variant is unnormalized (compat)"
171            self.cjk_compat_variants_fully_decomp[char_int] = cjk_compat_variant_parts
172
173    def _load_norm_props(self):
174        props = collections.defaultdict(list)
175
176        for line in self._fetch("DerivedNormalizationProps.txt").splitlines():
177            (prop_data, _, _) = line.partition("#")
178            prop_pieces = prop_data.split(";")
179
180            if len(prop_pieces) < 2:
181                continue
182
183            assert len(prop_pieces) <= 3
184            (low, _, high) = prop_pieces[0].strip().partition("..")
185
186            prop = prop_pieces[1].strip()
187
188            data = None
189            if len(prop_pieces) == 3:
190                data = prop_pieces[2].strip()
191
192            props[prop].append((low, high, data))
193
194        return props
195
196    def _load_norm_tests(self):
197        tests = []
198        for line in self._fetch("NormalizationTest.txt").splitlines():
199            (test_data, _, _) = line.partition("#")
200            test_pieces = test_data.split(";")
201
202            if len(test_pieces) < 5:
203                continue
204
205            source, nfc, nfd, nfkc, nfkd = [[c.strip() for c in p.split()] for p in test_pieces[:5]]
206            tests.append(NormalizationTest(source, nfc, nfd, nfkc, nfkd))
207
208        return tests
209
210    def _compute_canonical_comp(self):
211        canon_comp = {}
212        comp_exclusions = [
213            (int(low, 16), int(high or low, 16))
214            for low, high, _ in self.norm_props["Full_Composition_Exclusion"]
215        ]
216        for char_int, decomp in self.canon_decomp.items():
217            if any(lo <= char_int <= hi for lo, hi in comp_exclusions):
218                continue
219
220            assert len(decomp) == 2
221            assert (decomp[0], decomp[1]) not in canon_comp
222            canon_comp[(decomp[0], decomp[1])] = char_int
223
224        return canon_comp
225
226    def _compute_fully_decomposed(self):
227        """
228        Even though the decomposition algorithm is recursive, it is possible
        to precompute the recursion at table generation time, with only a
        modest increase in table size.  Then, for these precomputed tables, we
        note that 1) canonical decomposition is a subset of compatibility
        decomposition and 2) they mostly agree on their intersection.
        Therefore, we don't store entries in the compatibility table for
        characters that decompose the same way under canonical decomposition.
235
236            Decomposition table stats:
237            Canonical decomp: 2060 chars => 3085 decomposed chars
238            Compatible decomp: 3662 chars => 5440 decomposed chars
239            Canonical fully decomp: 2060 chars => 3404 decomposed chars
240            Compatible fully decomp: 3678 chars => 5599 decomposed chars
241
        The upshot is that the decomposition code is very simple and easy to
        inline, at a mild code-size cost.
244        """
245        def _decompose(char_int, compatible):
246            # 7-bit ASCII never decomposes
247            if char_int <= 0x7f:
248                yield char_int
249                return
250
251            # Assert that we're handling Hangul separately.
252            assert not (S_BASE <= char_int < S_BASE + S_COUNT)
253
254            decomp = self.canon_decomp.get(char_int)
255            if decomp is not None:
256                for decomposed_ch in decomp:
257                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
258                        yield fully_decomposed_ch
259                return
260
261            if compatible and char_int in self.compat_decomp:
262                for decomposed_ch in self.compat_decomp[char_int]:
263                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
264                        yield fully_decomposed_ch
265                return
266
267            yield char_int
268            return
269
270        end_codepoint = max(
271            max(self.canon_decomp.keys()),
272            max(self.compat_decomp.keys()),
273        )
274
275        canon_fully_decomp = {}
276        compat_fully_decomp = {}
277
278        for char_int in range(0, end_codepoint + 1):
279            # Always skip Hangul, since it's more efficient to represent its
280            # decomposition programmatically.
281            if S_BASE <= char_int < S_BASE + S_COUNT:
282                continue
283
284            canon = list(_decompose(char_int, False))
285            if not (len(canon) == 1 and canon[0] == char_int):
286                canon_fully_decomp[char_int] = canon
287
288            compat = list(_decompose(char_int, True))
289            if not (len(compat) == 1 and compat[0] == char_int):
290                compat_fully_decomp[char_int] = compat
291
292        # Since canon_fully_decomp is a subset of compat_fully_decomp, we don't
293        # need to store their overlap when they agree.  When they don't agree,
294        # store the decomposition in the compatibility table since we'll check
295        # that first when normalizing to NFKD.
296        assert set(canon_fully_decomp) <= set(compat_fully_decomp)
297
298        for ch in set(canon_fully_decomp) & set(compat_fully_decomp):
299            if canon_fully_decomp[ch] == compat_fully_decomp[ch]:
300                del compat_fully_decomp[ch]
301
302        return canon_fully_decomp, compat_fully_decomp
303
304    def _compute_stream_safe_tables(self):
305        """
306        To make a text stream-safe with the Stream-Safe Text Process (UAX15-D4),
        we need to know the number of contiguous non-starters *after*
308        applying compatibility decomposition to each character.
309
310        We can do this incrementally by computing the number of leading and
311        trailing non-starters for each character's compatibility decomposition
312        with the following rules:
313
314        1) If a character is not affected by compatibility decomposition, look
315           up its canonical combining class to find out if it's a non-starter.
316        2) All Hangul characters are starters, even under decomposition.
317        3) Otherwise, very few decomposing characters have a nonzero count
318           of leading or trailing non-starters, so store these characters
319           with their associated counts in a separate table.
320        """
321        leading_nonstarters = {}
322        trailing_nonstarters = {}
323
324        for c in set(self.canon_fully_decomp) | set(self.compat_fully_decomp):
325            decomposed = self.compat_fully_decomp.get(c) or self.canon_fully_decomp[c]
326
327            num_leading = 0
328            for d in decomposed:
329                if d not in self.combining_classes:
330                    break
331                num_leading += 1
332
333            num_trailing = 0
334            for d in reversed(decomposed):
335                if d not in self.combining_classes:
336                    break
337                num_trailing += 1
338
339            if num_leading > 0:
340                leading_nonstarters[c] = num_leading
341            if num_trailing > 0:
342                trailing_nonstarters[c] = num_trailing
343
344        return leading_nonstarters, trailing_nonstarters
345
346hexify = lambda c: '{:04X}'.format(c)
347
348# Test whether `first` and `last` are corresponding "<..., First>" and
349# "<..., Last>" markers.
350def is_first_and_last(first, last):
351    if not first.startswith('<') or not first.endswith(', First>'):
352        return False
353    if not last.startswith('<') or not last.endswith(', Last>'):
354        return False
355    return first[1:-8] == last[1:-7]
356
357def gen_mph_data(name, d, kv_type, kv_callback):
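    # Writes to the module-level `out` handle opened in __main__ (this helper
    # does not take `out` as a parameter). It emits a <NAME>_SALT array of u16
    # displacement salts and a <NAME>_KV array of `kv_type` entries, one per key.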
358    (salt, keys) = minimal_perfect_hash(d)
359    out.write("pub(crate) const %s_SALT: &[u16] = &[\n" % name.upper())
360    for s in salt:
361        out.write("    0x{:x},\n".format(s))
362    out.write("];\n")
363    out.write("pub(crate) const {}_KV: &[{}] = &[\n".format(name.upper(), kv_type))
364    for k in keys:
365        out.write("    {},\n".format(kv_callback(k)))
366    out.write("];\n\n")
367
368def gen_combining_class(combining_classes, out):
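    # Each value packs the codepoint and its class: (codepoint << 8) | ccc, so
    # the low 8 bits hold the canonical combining class.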
369    gen_mph_data('canonical_combining_class', combining_classes, 'u32',
370        lambda k: "0x{:X}".format(int(combining_classes[k]) | (k << 8)))
371
def gen_composition_table(canon_comp, out):
    table = {}
    for (c1, c2), c3 in canon_comp.items():
        if c1 < 0x10000 and c2 < 0x10000:
            table[(c1 << 16) | c2] = c3
    gen_mph_data('composition_table', table, '(u32, char)',
        lambda k: "(0x%s, '\\u{%s}')" % (hexify(k), hexify(table[k])))
380
381    out.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n")
382    out.write("    match (c1, c2) {\n")
383    for (c1, c2), c3 in sorted(canon_comp.items()):
384        if c1 >= 0x10000 and c2 >= 0x10000:
385            out.write("        ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))
386
387    out.write("        _ => None,\n")
388    out.write("    }\n")
389    out.write("}\n")
390
391def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_decomp, out):
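    # Each table is emitted as one flat <NAME>_DECOMPOSED_CHARS array of chars,
    # plus an MPH map from codepoint to a (u16 offset, u16 length) slice into
    # that array.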
392    tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility'), (cjk_compat_variants_decomp, 'cjk_compat_variants')]
393    for table, name in tables:
394        offsets = {}
395        offset = 0
396        out.write("pub(crate) const %s_DECOMPOSED_CHARS: &[char] = &[\n" % name.upper())
397        for k, v in table.items():
398            offsets[k] = offset
399            offset += len(v)
400            for c in v:
401                out.write("    '\\u{%s}',\n" % hexify(c))
402        # The largest offset must fit in a u16.
403        assert offset < 65536
404        out.write("];\n")
405        gen_mph_data(name + '_decomposed', table, "(u32, (u16, u16))",
406            lambda k: "(0x{:x}, ({}, {}))".format(k, offsets[k], len(table[k])))
407
408def gen_qc_match(prop_table, out):
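    # Per the UCD quick-check properties, 'N' means "No" and 'M' means "Maybe"
    # (a Maybe answer requires falling back to a full normalization check);
    # codepoints not listed default to "Yes".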
409    out.write("    match c {\n")
410
411    for low, high, data in prop_table:
412        assert data in ('N', 'M')
413        result = "No" if data == 'N' else "Maybe"
414        if high:
415            out.write(r"        '\u{%s}'...'\u{%s}' => %s," % (low, high, result))
416        else:
417            out.write(r"        '\u{%s}' => %s," % (low, result))
418        out.write("\n")
419
420    out.write("        _ => Yes,\n")
421    out.write("    }\n")
422
423def gen_nfc_qc(prop_tables, out):
424    out.write("#[inline]\n")
425    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
426    out.write("pub fn qc_nfc(c: char) -> IsNormalized {\n")
427    gen_qc_match(prop_tables['NFC_QC'], out)
428    out.write("}\n")
429
430def gen_nfkc_qc(prop_tables, out):
431    out.write("#[inline]\n")
432    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
433    out.write("pub fn qc_nfkc(c: char) -> IsNormalized {\n")
434    gen_qc_match(prop_tables['NFKC_QC'], out)
435    out.write("}\n")
436
437def gen_nfd_qc(prop_tables, out):
438    out.write("#[inline]\n")
439    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
440    out.write("pub fn qc_nfd(c: char) -> IsNormalized {\n")
441    gen_qc_match(prop_tables['NFD_QC'], out)
442    out.write("}\n")
443
444def gen_nfkd_qc(prop_tables, out):
445    out.write("#[inline]\n")
446    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
447    out.write("pub fn qc_nfkd(c: char) -> IsNormalized {\n")
448    gen_qc_match(prop_tables['NFKD_QC'], out)
449    out.write("}\n")
450
451def gen_combining_mark(general_category_mark, out):
452    gen_mph_data('combining_mark', general_category_mark, 'u32',
453        lambda k: '0x{:04x}'.format(k))
454
455def gen_public_assigned(general_category_public_assigned, out):
456    # This could be done as a hash but the table is somewhat small.
457    out.write("#[inline]\n")
458    out.write("pub fn is_public_assigned(c: char) -> bool {\n")
459    out.write("    match c {\n")
460
461    start = True
462    for first, last in general_category_public_assigned:
463        if start:
464            out.write("        ")
465            start = False
466        else:
467            out.write("        | ")
468        if first == last:
469            out.write("'\\u{%s}'\n" % hexify(first))
470        else:
471            out.write("'\\u{%s}'..='\\u{%s}'\n" % (hexify(first), hexify(last)))
472    out.write("        => true,\n")
473
474    out.write("        _ => false,\n")
475    out.write("    }\n")
476    out.write("}\n")
477    out.write("\n")
478
479def gen_stream_safe(leading, trailing, out):
480    # This could be done as a hash but the table is very small.
481    out.write("#[inline]\n")
482    out.write("pub fn stream_safe_leading_nonstarters(c: char) -> usize {\n")
483    out.write("    match c {\n")
484
485    for char, num_leading in sorted(leading.items()):
486        out.write("        '\\u{%s}' => %d,\n" % (hexify(char), num_leading))
487
488    out.write("        _ => 0,\n")
489    out.write("    }\n")
490    out.write("}\n")
491    out.write("\n")
492
493    gen_mph_data('trailing_nonstarters', trailing, 'u32',
494        lambda k: "0x{:X}".format(int(trailing[k]) | (k << 8)))
495
496def gen_tests(tests, out):
497    out.write("""#[derive(Debug)]
498pub struct NormalizationTest {
499    pub source: &'static str,
500    pub nfc: &'static str,
501    pub nfd: &'static str,
502    pub nfkc: &'static str,
503    pub nfkd: &'static str,
504}
505
506""")
507
508    out.write("pub const NORMALIZATION_TESTS: &[NormalizationTest] = &[\n")
509    str_literal = lambda s: '"%s"' % "".join("\\u{%s}" % c for c in s)
510
511    for test in tests:
512        out.write("    NormalizationTest {\n")
513        out.write("        source: %s,\n" % str_literal(test.source))
514        out.write("        nfc: %s,\n" % str_literal(test.nfc))
515        out.write("        nfd: %s,\n" % str_literal(test.nfd))
516        out.write("        nfkc: %s,\n" % str_literal(test.nfkc))
517        out.write("        nfkd: %s,\n" % str_literal(test.nfkd))
518        out.write("    },\n")
519
520    out.write("];\n")
521
# Returns a hash of x that is guaranteed to be less than n.
def my_hash(x, salt, n):
    # A multiplicative hash, chosen on the theory that multiplication is cheap.
525    mask_32 = 0xffffffff
526    y = ((x + salt) * 2654435769) & mask_32
527    y ^= (x * 0x31415926) & mask_32
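    # (y * n) >> 32 computes floor(y * n / 2**32), which is always in [0, n)
    # for a 32-bit y; e.g. y = 0xffffffff with n = 100 yields 99.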
528    return (y * n) >> 32
529
530# Compute minimal perfect hash function, d can be either a dict or list of keys.
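# This uses a hash-and-displace scheme: every key is first hashed with salt 0
# into one of n buckets, then each bucket searches for a salt that rehashes all
# of its keys into empty slots. A lookup therefore takes two hashes:
#   h1 = my_hash(x, 0, n); slot = my_hash(x, salt[h1], n)
# which is how the generated SALT/KV tables are intended to be consumed by the
# crate's Rust-side lookup.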
531def minimal_perfect_hash(d):
532    n = len(d)
533    buckets = dict((h, []) for h in range(n))
534    for key in d:
535        h = my_hash(key, 0, n)
536        buckets[h].append(key)
537    bsorted = [(len(buckets[h]), h) for h in range(n)]
    bsorted.sort(reverse=True)
539    claimed = [False] * n
540    salts = [0] * n
541    keys = [0] * n
542    for (bucket_size, h) in bsorted:
        # Note: the traditional perfect hashing approach would also special-case
        # bucket_size == 1 here and assign any empty slot, rather than iterating
        # until rehash finds an empty slot. We skip that special case so that
        # the generated lookup code can avoid the corresponding branch.
547        if bucket_size == 0:
548            break
549        else:
550            for salt in range(1, 32768):
551                rehashes = [my_hash(key, salt, n) for key in buckets[h]]
552                # Make sure there are no rehash collisions within this bucket.
                if all(not claimed[rh] for rh in rehashes):
554                    if len(set(rehashes)) < bucket_size:
555                        continue
556                    salts[h] = salt
557                    for key in buckets[h]:
558                        rehash = my_hash(key, salt, n)
559                        claimed[rehash] = True
560                        keys[rehash] = key
561                    break
562            if salts[h] == 0:
563                print("minimal perfect hashing failed")
564                # Note: if this happens (because of unfortunate data), then there are
565                # a few things that could be done. First, the hash function could be
566                # tweaked. Second, the bucket order could be scrambled (especially the
567                # singletons). Right now, the buckets are sorted, which has the advantage
568                # of being deterministic.
569                #
570                # As a more extreme approach, the singleton bucket optimization could be
571                # applied (give the direct address for singleton buckets, rather than
572                # relying on a rehash). That is definitely the more standard approach in
573                # the minimal perfect hashing literature, but in testing the branch was a
574                # significant slowdown.
                sys.exit(1)
576    return (salts, keys)
577
578if __name__ == '__main__':
579    data = UnicodeData()
580    with open("tables.rs", "w", newline = "\n") as out:
581        out.write(PREAMBLE)
582        out.write("use crate::quick_check::IsNormalized;\n")
583        out.write("use crate::quick_check::IsNormalized::*;\n")
584        out.write("\n")
585
586        version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split("."))
587        out.write("#[allow(unused)]\n")
588        out.write("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n\n" % version)
589
590        gen_combining_class(data.combining_classes, out)
591        out.write("\n")
592
593        gen_composition_table(data.canon_comp, out)
594        out.write("\n")
595
596        gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.cjk_compat_variants_fully_decomp, out)
597
598        gen_combining_mark(data.general_category_mark, out)
599        out.write("\n")
600
601        gen_public_assigned(data.general_category_public_assigned, out)
602        out.write("\n")
603
604        gen_nfc_qc(data.norm_props, out)
605        out.write("\n")
606
607        gen_nfkc_qc(data.norm_props, out)
608        out.write("\n")
609
610        gen_nfd_qc(data.norm_props, out)
611        out.write("\n")
612
613        gen_nfkd_qc(data.norm_props, out)
614        out.write("\n")
615
616        gen_stream_safe(data.ss_leading, data.ss_trailing, out)
617        out.write("\n")
618
619    with open("normalization_tests.rs", "w", newline = "\n") as out:
620        out.write(PREAMBLE)
621        gen_tests(data.norm_tests, out)
622