#!/bin/sh
# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# References:
#   https://encoding.spec.whatwg.org/#big5

# This script downloads the following file.
#   https://encoding.spec.whatwg.org/index-big5.txt
#
# It then writes an ICU .ucm mapping table for "big5-html" to stdout:
# a fixed preamble, the ASCII range, the Big5 mappings derived from the
# downloaded index (sorted by code point), and the closing END CHARMAP line.
#
# Functions use the POSIX "name() { ... }" form because the interpreter is
# /bin/sh; the "function name { ... }" form is a bash/ksh extension.

# Emit the .ucm header: converter properties, the MBCS state table and the
# opening CHARMAP line.
preamble() {
cat <<PREAMBLE
# ***************************************************************************
# *
# * Copyright (C) 1995-2014, International Business Machines
# * Corporation and others. All Rights Reserved.
# *
# * Generated per the algorithm for Big5
# * described at http://encoding.spec.whatwg.org/#big5
# *
# ***************************************************************************
<code_set_name> "big5-html"
<char_name_mask> "AXXXX"
<mb_cur_max> 2
<mb_cur_min> 1
<uconv_class> "MBCS"
<subchar> \x3F
<icu:charsetFamily> "ASCII"

# 'p' is for the range that may produce non-BMP code points.
# 'i' is to make the code range illegal.
# Big5 has a lot of small holes in the 2nd byte. If it's in the ASCII range,
# the 2nd byte has to be added back to the stream to be compliant to the
# encoding spec. Each state adds 1kB in the data size.
# See http://userguide.icu-project.org/conversion/data.
<icu:state> 0-7f, a1-fe:1, 87-a0:2, c8:2, fa-fe:2, 87:3, 89:4, 8a:5, 8b:6, 8d:7, 9b:8, 9f:9, a0:a
<icu:state> 40-7e, a1-fe
<icu:state> 40-7e.p, a1-fe.p
<icu:state> 40-7e.p, a1-fe.p, 66.i
<icu:state> 40-7e.p, a1-fe.p, 42.i, 44.i, 45.i, 4a-4b.i
<icu:state> 40-7e.p, a1-fe.p, 42.i, 63.i, 75.i
<icu:state> 40-7e.p, a1-fe.p, 54.i
<icu:state> 40-7e.p, a1-fe.p, 41.i
<icu:state> 40-7e.p, a1-fe.p, 61.i
<icu:state> 40-7e.p, a1-fe.p, 4e.i
<icu:state> 40-7e.p, a1-fe.p, 54.i, 57.i, 5a.i, 62.i, 72.i

CHARMAP
PREAMBLE
}

# Emit round-trip (|0) mappings for U+0000..U+007F onto bytes 0x00..0x7F.
# A plain counter loop is used so that the non-POSIX seq(1) is not required.
ascii() {
  i=0
  while [ "$i" -le 127 ]
  do
    printf '<U%04X> \\x%02X |0\n' "$i" "$i"
    i=$((i + 1))
  done
}


# Convert index-big5.txt (in the current directory) into .ucm mapping lines.
# Each output line is "<Uxxxx> \xLL\xTT |tag sortkey"; the trailing sortkey
# (the code point zero-padded to five hex digits) is only there for the
# caller to sort on and is stripped afterwards.
#
# HKSCS characters are not supported in encoding ( |lead < 0xA1| )
# Entries with pointer=528[79] and 5247 ~ 5250 have to be decoding-only
# even though they come before the other entry with the same Unicode
# character. The corresponding Unicode characters are U+255[0E],
# U+256[1A], and U+534[15].
# See https://www.w3.org/Bugs/Public/show_bug.cgi?id=27878
big5() {
  # Numeric constants are written in decimal: hexadecimal literals in awk
  # program text are a gawk extension and not portable to every awk.
  awk '
    # Skip comment lines and blank lines of the index file.
    !/^#/ && !/^$/ {
      pointer = $1
      ucs = substr($2, 3)                       # drop the leading "0x"
      sortkey = (length(ucs) < 5) ? ("0" ucs) : ucs

      lead = int(pointer / 157) + 129           # 129 == 0x81
      trail = pointer % 157
      trail_offset = (trail < 63) ? 64 : 98     # 0x3F ? 0x40 : 0x62

      is_decoding_only = (lead < 161 ||         # 161 == 0xA1 (HKSCS area)
                          seen_before[ucs] ||
                          pointer == 5287 || pointer == 5289 ||
                          (5247 <= pointer && pointer <= 5250))
      tag = is_decoding_only ? 3 : 0

      printf("<U%4s> \\x%02X\\x%02X |%d %s\n",
             ucs, lead, trail + trail_offset, tag, sortkey)

      # Remember a code point once it has a round-trip mapping. The flag is
      # never cleared, so every later duplicate stays decoding-only and a
      # code point cannot end up with two round-trip entries.
      if (!is_decoding_only)
        seen_before[ucs] = 1
    }' \
  index-big5.txt
}

# Emit the four decoding-only mappings whose Unicode side is a two-code-point
# sequence; these are not produced by big5.
two_char_seq() {
cat <<EOF
<U00CA><U0304> \x88\x62 |3 000CA
<U00CA><U030C> \x88\x64 |3 000CA
<U00EA><U0304> \x88\xA3 |3 000EA
<U00EA><U030C> \x88\xA5 |3 000EA
EOF
}

# Emit all non-ASCII mapping lines, still carrying their sort key.
unsorted_table() {
  two_char_seq
  big5
}

wget -N -r -nd https://encoding.spec.whatwg.org/index-big5.txt
# wget's exit status is deliberately not fatal (a previously downloaded copy
# is still usable), but without any index the table would be silently
# truncated, so stop here in that case.
if [ ! -s index-big5.txt ]
then
  echo 'error: index-big5.txt is missing or empty' >&2
  exit 1
fi

preamble
ascii
# Sort on the 4th field (the sort key), drop exact duplicates, then strip the
# sort key so only "<U...> \x.. |tag" remains.
unsorted_table | sort -k4 | uniq | cut -f 1-3 -d ' '
echo 'END CHARMAP'