xref: /aosp_15_r20/external/cronet/third_party/icu/scripts/eucjp_gen.sh (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1#!/bin/sh
2# Copyright 2014 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6# References:
7#   https://encoding.spec.whatwg.org/#euc-jp
8#   https://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932
9#   https://www.iana.org/assignments/charset-reg/CP51932
10#   Table 3-64 in CJKV Information Processing 2/e.
11
# Run this script in the source/data/mappings directory and save its output
# to euc-jp-html5.ucm. It fetches the following two index files with wget:
14#   https://encoding.spec.whatwg.org/index-jis0208.txt
15#   https://encoding.spec.whatwg.org/index-jis0212.txt
16
#######################################
# Print the fixed head of the .ucm file: the copyright banner, the converter
# attributes, the <icu:state> rows and the opening CHARMAP line.
# Arguments: none
# Outputs:   the header text on stdout
#######################################
preamble() {
  # POSIX "name()" form: the "function" keyword is not understood by every
  # /bin/sh. The delimiter is quoted so the text below is emitted verbatim
  # (no parameter, command or backslash processing).
cat <<'PREAMBLE'
# ***************************************************************************
# *
# *   Copyright (C) 1995-2014, International Business Machines
# *   Corporation and others.  All Rights Reserved.
# *
# *   Generated per the algorithm for EUC-JP
# *   described at https://encoding.spec.whatwg.org/#euc-jp.
# *
# ***************************************************************************
<code_set_name>               "euc-jp-html"
<char_name_mask>              "AXXXX"
<mb_cur_max>                  3
<mb_cur_min>                  1
<uconv_class>                 "MBCS"
<subchar>                     \x3F
<icu:charsetFamily>           "ASCII"

<icu:state>                   0-7f, 8e:2, 8f:3, a1-fe:1
<icu:state>                   a1-fe
<icu:state>                   a1-df
<icu:state>                   a1-fe:1, a1:4, a3-a5:4, a8:4, ac-af:4, ee-f2:4, f4-fe:4
<icu:state>                   a1-fe.u

CHARMAP
PREAMBLE
}
45
46#<U0000> \x00 |0
#######################################
# Emit the identity mappings for the 128 ASCII code points, one per line,
# in the form "<U0041> \x41 |0".
# Arguments: none
# Outputs:   128 charmap lines on stdout
#######################################
ascii() {
  # Plain POSIX loop: no "function" keyword and no dependency on seq.
  code=0
  while [ "$code" -le 127 ]; do
    printf '<U%04X> \\x%02X |0\n' "$code" "$code"
    code=$((code + 1))
  done
}
53
54
55# Map 0x8E 0x[A1-DF] to U+FF61 to U+FF9F
#######################################
# Emit the two-byte mappings 0x8E 0xA1..0xDF -> U+FF61..U+FF9F, one per
# line, in the form "<UFF61> \x8E\xA1 |0".
# Arguments: none
# Outputs:   63 charmap lines on stdout
#######################################
half_width_kana() {
  # The range is walked with shell arithmetic instead of "seq 0xA1 0xDF",
  # which depends on seq being installed and accepting hexadecimal operands.
  trail=$((0xA1))
  while [ "$trail" -le "$((0xDF))" ]; do
    # Trail byte 0xA1 pairs with U+FF61, so the offset is constant.
    printf '<U%04X> \\x8E\\x%02X |0\n' "$((trail - 0xA1 + 0xFF61))" "$trail"
    trail=$((trail + 1))
  done
}
63
64
65# index-jis0208.txt has index pointers larger than the size of
66# the encoding space available in 2-byte Graphic plane of ISO-2022-based
67# encoding (94 x 94 = 8836). We have to exclude them because they're for
68# Shift-JIS.
69# In addition, index-jis0208.txt has 10 pairs of duplicate mapping entries.
70# All the bi-directional mapping entries come *before* the uni-directional
71# (EUC-JP to Unicode) entries so that we put '|3' if we have seen
72# the same Unicode code point earlier in the list. According to the definition
73# of 'index pointer' in the W3C encoding spec, it's the first entry in the
74# file for a given Unicode code point.
75
#######################################
# Convert index-jis0208.txt (read from the current directory) into two-byte
# charmap lines of the form "<U3000> \xA1\xA1 |0".
# Only pointers 0..8835 are used: they are the 94 x 94 cells addressable as
# lead/trail bytes 0xA1..0xFE. A code point that already appeared on an
# earlier line is flagged "|3"; its first occurrence gets "|0".
# Arguments: none
# Outputs:   charmap lines on stdout, in input order
#######################################
jis208() {
  # Notes on the awk program:
  #  - "$1 < 8836" rather than "<=": pointer 8836 would need lead byte 0xFF.
  #  - 161 is 0xA1; hexadecimal constants in awk source are a gawk extension.
  #  - int() truncates the quotient explicitly instead of relying on how
  #    %X treats a fractional value.
  #  - "NF < 2" skips blank and whitespace-only lines.
  awk '
    /^#/ || NF < 2 { next }
    $1 < 8836 {
      printf("<U%4s> \\x%02X\\x%02X |%d\n", substr($2, 3),
             int($1 / 94) + 161, $1 % 94 + 161,
             ($2 in seen) ? 3 : 0)
      seen[$2] = 1
    }
  ' index-jis0208.txt
}
85
# JIS X 0212 is for decoding only (use '|3' to denote that).
87
#######################################
# Convert index-jis0212.txt (read from the current directory) into
# three-byte charmap lines of the form "<U02D8> \x8F\xA2\xAF |3".
# Every line is flagged "|3".
# Arguments: none
# Outputs:   charmap lines on stdout, in input order
#######################################
jis212() {
  # 161 is 0xA1, written in decimal because hexadecimal constants in awk
  # source are a gawk extension. int() truncates the quotient explicitly.
  # "NF < 2" skips blank and whitespace-only lines.
  awk '
    /^#/ || NF < 2 { next }
    {
      printf("<U%4s> \\x8F\\x%02X\\x%02X |3\n", substr($2, 3),
             int($1 / 94) + 161, $1 % 94 + 161)
    }
  ' index-jis0212.txt
}
94
#######################################
# Emit every charmap line in generation order: the four generated groups
# followed by three fixed "|1" entries.
# Arguments: none
# Outputs:   charmap lines on stdout (not sorted, duplicates not removed)
#######################################
unsorted_table() {
  ascii
  half_width_kana
  jis208
  jis212
  # printf '%s\n' prints the backslashes untouched; whether echo interprets
  # sequences such as "\x5C" differs between sh implementations.
  printf '%s\n' \
    '<U00A5> \x5C |1' \
    '<U203E> \x7E |1' \
    '<U2212> \xA1\xDD |1'
}
104
# Fetch (or refresh) the two index files that the table generators read.
wget -N -r -nd https://encoding.spec.whatwg.org/index-jis0208.txt
wget -N -r -nd https://encoding.spec.whatwg.org/index-jis0212.txt

# Stop before writing anything if an index file is missing or empty;
# otherwise the output would look complete while lacking whole sections.
for index_file in index-jis0208.txt index-jis0212.txt; do
  if [ ! -s "$index_file" ]; then
    printf '%s: %s is missing or empty\n' "$0" "$index_file" >&2
    exit 1
  fi
done

preamble
# Byte-wise collation keeps the line order independent of the caller's
# locale; -u drops the exact duplicate lines.
unsorted_table | LC_ALL=C sort -u
echo 'END CHARMAP'
110