#!/bin/sh
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Generates the ICU mapping table (UCM) for EUC-JP as defined by the
# WHATWG Encoding Standard and writes it to stdout.
#
# References:
# https://encoding.spec.whatwg.org/#euc-jp
# https://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932
# https://www.iana.org/assignments/charset-reg/CP51932
# Table 3-64 in CJKV Information Processing 2/e.

# Usage: run this script in the source/data/mappings directory and save its
# standard output to euc-jp-html5.ucm. The script fetches the following two
# index files into the current directory with wget before generating:
# https://encoding.spec.whatwg.org/index-jis0208.txt
# https://encoding.spec.whatwg.org/index-jis0212.txt
#
# Only POSIX function syntax is used so the script works when /bin/sh is
# not bash (for example dash).

# Prints the UCM header: copyright banner, converter attributes, the MBCS
# state table and the opening CHARMAP line. The delimiter is quoted so the
# text is emitted literally.
preamble() {
cat <<'PREAMBLE'
# ***************************************************************************
# *
# * Copyright (C) 1995-2014, International Business Machines
# * Corporation and others. All Rights Reserved.
# *
# * Generated per the algorithm for EUC-JP
# * described at https://encoding.spec.whatwg.org/#euc-jp.
# *
# ***************************************************************************
<code_set_name> "euc-jp-html"
<char_name_mask> "AXXXX"
<mb_cur_max> 3
<mb_cur_min> 1
<uconv_class> "MBCS"
<subchar> \x3F
<icu:charsetFamily> "ASCII"

<icu:state> 0-7f, 8e:2, 8f:3, a1-fe:1
<icu:state> a1-fe
<icu:state> a1-df
<icu:state> a1-fe:1, a1:4, a3-a5:4, a8:4, ac-af:4, ee-f2:4, f4-fe:4
<icu:state> a1-fe.u

CHARMAP
PREAMBLE
}

# Prints the 128 ASCII round-trip mappings, one per line, in the form
#<U0000> \x00 |0
ascii() {
  i=0
  while [ "$i" -le 127 ]; do
    printf '<U%04X> \\x%02X |0\n' "$i" "$i"
    i=$((i + 1))
  done
}


# Map 0x8E 0x[A1-DF] to U+FF61 to U+FF9F
half_width_kana() {
  # 161 = 0xA1, 223 = 0xDF; decimal bounds avoid relying on a seq that
  # accepts hexadecimal arguments.
  i=161
  while [ "$i" -le 223 ]; do
    # 65377 = 0xFF61, 161 = 0xA1
    printf '<U%04X> \\x8E\\x%02X |0\n' "$((i + 65377 - 161))" "$i"
    i=$((i + 1))
  done
}


# index-jis0208.txt has index pointers larger than the size of
# the encoding space available in 2-byte Graphic plane of ISO-2022-based
# encoding (94 x 94 = 8836, i.e. valid pointers are 0..8835). We have to
# exclude them because they're for Shift-JIS.
# In addition, index-jis0208.txt has 10 pairs of duplicate mapping entries.
# All the bi-directional mapping entries come *before* the uni-directional
# (EUC-JP to Unicode) entries so that we put '|3' if we have seen
# the same Unicode code point earlier in the list. According to the definition
# of 'index pointer' in the Encoding Standard, it's the first entry in the
# file for a given Unicode code point.
#
# Reads index-jis0208.txt from the current directory. Field 1 is the index
# pointer and field 2 is the code point written as 0xXXXX. The lead byte is
# pointer / 94 + 0xA1 and the trail byte is pointer % 94 + 0xA1; 161 is 0xA1
# in decimal because hexadecimal constants in awk program text are not
# portable across awk implementations.
jis208() {
  awk '
    /^#/ || /^$/ || $1 >= 8836 { next }
    {
      printf("<U%4s> \\x%02X\\x%02X |%d\n", substr($2, 3),
             int($1 / 94) + 161, $1 % 94 + 161,
             ($2 in uset) ? 3 : 0)
      uset[$2] = 1
    }
  ' index-jis0208.txt
}

# JIS X 0212 is for decoding only (use '|3' to denote that).
#
# Reads index-jis0212.txt from the current directory and prints one
# three-byte mapping (0x8F lead, trail) per entry, using the same
# pointer-to-byte arithmetic as jis208.
jis212() {
  awk '
    /^#/ || /^$/ { next }
    {
      printf("<U%4s> \\x8F\\x%02X\\x%02X |3\n", substr($2, 3),
             int($1 / 94) + 161, $1 % 94 + 161)
    }
  ' index-jis0212.txt
}

# Prints every mapping line in generation order (not yet sorted or
# de-duplicated): ASCII, half-width kana, JIS X 0208, JIS X 0212, then
# three fallback ('|1', Unicode to EUC-JP only) entries.
unsorted_table() {
  ascii
  half_width_kana
  jis208
  jis212
  # printf keeps the backslashes literal; echo may interpret them in some
  # shells.
  printf '%s\n' \
    '<U00A5> \x5C |1' \
    '<U203E> \x7E |1' \
    '<U2212> \xA1\xDD |1'
}

# Refresh the index files. A failed download is reported on stderr; the
# script still continues so that previously downloaded copies can be used.
for index_file in index-jis0208.txt index-jis0212.txt; do
  wget -N -r -nd "https://encoding.spec.whatwg.org/${index_file}" ||
    echo "warning: could not fetch ${index_file}; using local copy if any" >&2
done

preamble
unsorted_table | sort | uniq
echo 'END CHARMAP'