xref: /aosp_15_r20/external/cronet/third_party/icu/scripts/big5_gen.sh (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1#!/bin/sh
2# Copyright 2015 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6# References:
7#   https://encoding.spec.whatwg.org/#big5
8
9# This script downloads the following file.
10#   https://encoding.spec.whatwg.org/index-big5.txt
11
# Prints the fixed header of the generated charmap to stdout: the IBM
# copyright banner, the converter metadata tags, the <icu:state> table and
# the opening CHARMAP line.
#
# Defined with the POSIX `name() {` form because the script runs under
# #!/bin/sh, where the `function` keyword is not guaranteed to exist.
# The here-document delimiter is quoted so the text (notably \x3F) is
# emitted verbatim with no shell expansion.
preamble() {
cat <<'PREAMBLE'
# ***************************************************************************
# *
# *   Copyright (C) 1995-2014, International Business Machines
# *   Corporation and others.  All Rights Reserved.
# *
# *   Generated per the algorithm for Big5
# *   described at http://encoding.spec.whatwg.org/#big5
# *
# ***************************************************************************
<code_set_name>               "big5-html"
<char_name_mask>              "AXXXX"
<mb_cur_max>                  2
<mb_cur_min>                  1
<uconv_class>                 "MBCS"
<subchar>                     \x3F
<icu:charsetFamily>           "ASCII"

# 'p' is for the range that may produce non-BMP code points.
# 'i' is to make the code range illegal.
# Big5 has a lot of small holes in the 2nd byte. If it's in the ASCII range,
# the 2nd byte has to be added back to the stream to be compliant to the
# encoding spec. Each state adds 1kB in the data size.
# See http://userguide.icu-project.org/conversion/data.
<icu:state>                   0-7f, a1-fe:1, 87-a0:2, c8:2, fa-fe:2, 87:3, 89:4, 8a:5, 8b:6, 8d:7, 9b:8, 9f:9, a0:a
<icu:state>                   40-7e, a1-fe
<icu:state>                   40-7e.p, a1-fe.p
<icu:state>                   40-7e.p, a1-fe.p, 66.i
<icu:state>                   40-7e.p, a1-fe.p, 42.i, 44.i, 45.i, 4a-4b.i
<icu:state>                   40-7e.p, a1-fe.p, 42.i, 63.i, 75.i
<icu:state>                   40-7e.p, a1-fe.p, 54.i
<icu:state>                   40-7e.p, a1-fe.p, 41.i
<icu:state>                   40-7e.p, a1-fe.p, 61.i
<icu:state>                   40-7e.p, a1-fe.p, 4e.i
<icu:state>                   40-7e.p, a1-fe.p, 54.i, 57.i, 5a.i, 62.i, 72.i

CHARMAP
PREAMBLE
}
52
# Prints the 128 single-byte ASCII mappings, one per line, in the form
#   <UXXXX> \xXX |0
# for code points 0 through 127.
#
# Uses the POSIX function form and a shell arithmetic loop so the script
# needs neither the `function` keyword nor the non-POSIX `seq` utility.
ascii() {
  i=0
  while [ "$i" -le 127 ]; do
    # '\\x' in the printf format yields a literal backslash followed by 'x'.
    printf '<U%04X> \\x%02X |0\n' "$i" "$i"
    i=$((i + 1))
  done
}
59
60
# Converts index-big5.txt (read from the current directory) into charmap
# lines on stdout, one per index entry:
#   <Uxxxx> \xLL\xTT |tag sortkey
# where LL/TT are the lead/trail bytes derived from the pointer (field 1),
# xxxx is the code point from field 2 with its "0x" prefix removed, tag is
# 0 for a round-trip mapping or 3 for a decoding-only mapping, and sortkey
# is the code point left-padded with "0" when shorter than five digits.
# The sortkey is a helper field for the caller's sort and is cut off later.
#
# HKSCS characters are not supported in encoding ( |lead < 0xA1| )
# Entries with pointer=528[79] and 5247 ~ 5250 have to be decoding-only
# even though they come before the other entry with the same Unicode
# character. The corresponding Unicode characters are U+255[0E],
# U+256[1A], and U+534[15].
# See https://www.w3.org/Bugs/Public/show_bug.cgi?id=27878
#
# Numeric constants in the awk program are written in decimal because
# hexadecimal constants in awk program text are not portable across awk
# implementations.
big5() {
  awk '!/^#/ && !/^$/ {
         pointer = $1 + 0
         ucs = substr($2, 3)
         sortkey = (length(ucs) < 5) ? ("0" ucs) : ucs

         # lead byte = floor(pointer / 157) + 0x81
         lead = int(pointer / 157) + 129

         # trail byte = offset + pointer % 157, offset 0x40 below 0x3F,
         # otherwise 0x62
         trail = pointer % 157
         trail += (trail < 63) ? 64 : 98

         # 161 == 0xA1: anything with a smaller lead byte is HKSCS.
         is_decoding_only = (lead < 161) || (ucs in seen_before) ||
             pointer == 5287 || pointer == 5289 ||
             (5247 <= pointer && pointer <= 5250)
         tag = is_decoding_only ? 3 : 0

         printf("<U%4s> \\x%02X\\x%02X |%d %s\n",
                ucs, lead, trail, tag, sortkey)

         # Remember the first round-trip mapping for this code point and
         # never forget it, so every later duplicate stays decoding-only.
         if (!is_decoding_only)
           seen_before[ucs] = 1
       }' \
  index-big5.txt
}
85
# Prints the four hard-coded entries in which a single byte pair maps to a
# sequence of two code points. Each line carries tag |3 (decoding-only) and
# a trailing sort key equal to the first code point of the sequence, so the
# caller's sort places it next to that code point.
#
# Written with the POSIX function form for #!/bin/sh; the here-document
# delimiter is quoted so the backslash sequences are emitted literally.
two_char_seq() {
cat <<'EOF'
<U00CA><U0304> \x88\x62 |3 000CA
<U00CA><U030C> \x88\x64 |3 000CA
<U00EA><U0304> \x88\xA3 |3 000EA
<U00EA><U030C> \x88\xA5 |3 000EA
EOF
}
94
# Emits every multi-byte mapping line, unsorted: first the fixed
# two-code-point sequences, then the entries generated from index-big5.txt.
# Each line ends with a sort key that the caller sorts on and then removes.
#
# Written with the POSIX function form for #!/bin/sh.
unsorted_table() {
  two_char_seq
  big5
}
99
# Fetch the index first and stop on failure: without it the table below
# would be built from a missing or stale index-big5.txt while the output
# still looked complete.
if ! wget -N -r -nd https://encoding.spec.whatwg.org/index-big5.txt; then
  echo "$0: failed to download index-big5.txt" >&2
  exit 1
fi

preamble
ascii
# Order the mappings by the sort key in field 4, drop exact duplicate lines,
# then keep only the three charmap fields (code point, bytes, tag).
# NOTE(review): sort uses the current locale's collation; consider running
# under LC_ALL=C if byte-for-byte reproducible ordering is needed.
unsorted_table | sort -k4  | uniq | cut -f 1-3 -d ' '
echo 'END CHARMAP'
105