xref: /aosp_15_r20/external/icu/libicu/cts_headers/csrmbcs.h (revision 0e209d3975ff4a8c132096b14b0e9364a753506e)
1*0e209d39SAndroid Build Coastguard Worker // © 2016 and later: Unicode, Inc. and others.
2*0e209d39SAndroid Build Coastguard Worker // License & terms of use: http://www.unicode.org/copyright.html
3*0e209d39SAndroid Build Coastguard Worker /*
4*0e209d39SAndroid Build Coastguard Worker  **********************************************************************
5*0e209d39SAndroid Build Coastguard Worker  *   Copyright (C) 2005-2012, International Business Machines
6*0e209d39SAndroid Build Coastguard Worker  *   Corporation and others.  All Rights Reserved.
7*0e209d39SAndroid Build Coastguard Worker  **********************************************************************
8*0e209d39SAndroid Build Coastguard Worker  */
9*0e209d39SAndroid Build Coastguard Worker 
10*0e209d39SAndroid Build Coastguard Worker #ifndef __CSRMBCS_H
11*0e209d39SAndroid Build Coastguard Worker #define __CSRMBCS_H
12*0e209d39SAndroid Build Coastguard Worker 
13*0e209d39SAndroid Build Coastguard Worker #include "unicode/utypes.h"
14*0e209d39SAndroid Build Coastguard Worker 
15*0e209d39SAndroid Build Coastguard Worker #if !UCONFIG_NO_CONVERSION
16*0e209d39SAndroid Build Coastguard Worker 
17*0e209d39SAndroid Build Coastguard Worker #include "csrecog.h"
18*0e209d39SAndroid Build Coastguard Worker 
19*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_BEGIN
20*0e209d39SAndroid Build Coastguard Worker 
21*0e209d39SAndroid Build Coastguard Worker // "Character"  iterated character class.
22*0e209d39SAndroid Build Coastguard Worker //    Recognizers for specific mbcs encodings make their "characters" available
23*0e209d39SAndroid Build Coastguard Worker //    by providing a nextChar() function that fills in an instance of IteratedChar
24*0e209d39SAndroid Build Coastguard Worker //    with the next char from the input.
25*0e209d39SAndroid Build Coastguard Worker //    The returned characters are not converted to Unicode, but remain as the raw
26*0e209d39SAndroid Build Coastguard Worker //    bytes (concatenated into an int) from the codepage data.
27*0e209d39SAndroid Build Coastguard Worker //
28*0e209d39SAndroid Build Coastguard Worker //  For Asian charsets, use the raw input rather than the input that has been
29*0e209d39SAndroid Build Coastguard Worker //   stripped of markup.  Detection only considers multi-byte chars, effectively
30*0e209d39SAndroid Build Coastguard Worker //   stripping markup anyway, and double byte chars do occur in markup too.
31*0e209d39SAndroid Build Coastguard Worker //
32*0e209d39SAndroid Build Coastguard Worker class IteratedChar : public UMemory
33*0e209d39SAndroid Build Coastguard Worker {
34*0e209d39SAndroid Build Coastguard Worker public:
35*0e209d39SAndroid Build Coastguard Worker     uint32_t charValue;             // 1-4 bytes from the raw input data
36*0e209d39SAndroid Build Coastguard Worker     int32_t  index;
37*0e209d39SAndroid Build Coastguard Worker     int32_t  nextIndex;
38*0e209d39SAndroid Build Coastguard Worker     UBool    error;
39*0e209d39SAndroid Build Coastguard Worker     UBool    done;
40*0e209d39SAndroid Build Coastguard Worker 
41*0e209d39SAndroid Build Coastguard Worker public:
42*0e209d39SAndroid Build Coastguard Worker     IteratedChar();
43*0e209d39SAndroid Build Coastguard Worker     //void reset();
44*0e209d39SAndroid Build Coastguard Worker     int32_t nextByte(InputText* det);
45*0e209d39SAndroid Build Coastguard Worker };
46*0e209d39SAndroid Build Coastguard Worker 
47*0e209d39SAndroid Build Coastguard Worker 
48*0e209d39SAndroid Build Coastguard Worker class CharsetRecog_mbcs : public CharsetRecognizer {
49*0e209d39SAndroid Build Coastguard Worker 
50*0e209d39SAndroid Build Coastguard Worker protected:
51*0e209d39SAndroid Build Coastguard Worker     /**
52*0e209d39SAndroid Build Coastguard Worker      * Test the match of this charset with the input text data
53*0e209d39SAndroid Build Coastguard Worker      *      which is obtained via the CharsetDetector object.
54*0e209d39SAndroid Build Coastguard Worker      *
55*0e209d39SAndroid Build Coastguard Worker      * @param det  The CharsetDetector, which contains the input text
56*0e209d39SAndroid Build Coastguard Worker      *             to be checked for being in this charset.
57*0e209d39SAndroid Build Coastguard Worker      * @return     Two values packed into one int  (Damn java, anyhow)
58*0e209d39SAndroid Build Coastguard Worker      *             <br/>
59*0e209d39SAndroid Build Coastguard Worker      *             bits 0-7:  the match confidence, ranging from 0-100
60*0e209d39SAndroid Build Coastguard Worker      *             <br/>
61*0e209d39SAndroid Build Coastguard Worker      *             bits 8-15: The match reason, an enum-like value.
62*0e209d39SAndroid Build Coastguard Worker      */
63*0e209d39SAndroid Build Coastguard Worker     int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen) const;
64*0e209d39SAndroid Build Coastguard Worker 
65*0e209d39SAndroid Build Coastguard Worker public:
66*0e209d39SAndroid Build Coastguard Worker 
67*0e209d39SAndroid Build Coastguard Worker     virtual ~CharsetRecog_mbcs();
68*0e209d39SAndroid Build Coastguard Worker 
69*0e209d39SAndroid Build Coastguard Worker     /**
70*0e209d39SAndroid Build Coastguard Worker      * Get the IANA name of this charset.
71*0e209d39SAndroid Build Coastguard Worker      * @return the charset name.
72*0e209d39SAndroid Build Coastguard Worker      */
73*0e209d39SAndroid Build Coastguard Worker 
74*0e209d39SAndroid Build Coastguard Worker     const char *getName() const override = 0;
75*0e209d39SAndroid Build Coastguard Worker     const char *getLanguage() const override = 0;
76*0e209d39SAndroid Build Coastguard Worker     UBool match(InputText* input, CharsetMatch *results) const override = 0;
77*0e209d39SAndroid Build Coastguard Worker 
78*0e209d39SAndroid Build Coastguard Worker     /**
79*0e209d39SAndroid Build Coastguard Worker      * Get the next character (however many bytes it is) from the input data
80*0e209d39SAndroid Build Coastguard Worker      *    Subclasses for specific charset encodings must implement this function
81*0e209d39SAndroid Build Coastguard Worker      *    to get characters according to the rules of their encoding scheme.
82*0e209d39SAndroid Build Coastguard Worker      *
83*0e209d39SAndroid Build Coastguard Worker      *  This function is not a method of class IteratedChar only because
84*0e209d39SAndroid Build Coastguard Worker      *   that would require a lot of extra derived classes, which is awkward.
85*0e209d39SAndroid Build Coastguard Worker      * @param it  The IteratedChar "struct" into which the returned char is placed.
86*0e209d39SAndroid Build Coastguard Worker      * @param det The charset detector, which is needed to get at the input byte data
87*0e209d39SAndroid Build Coastguard Worker      *            being iterated over.
88*0e209d39SAndroid Build Coastguard Worker      * @return    True if a character was returned, false at end of input.
89*0e209d39SAndroid Build Coastguard Worker      */
90*0e209d39SAndroid Build Coastguard Worker     virtual UBool nextChar(IteratedChar *it, InputText *textIn) const = 0;
91*0e209d39SAndroid Build Coastguard Worker 
92*0e209d39SAndroid Build Coastguard Worker };
93*0e209d39SAndroid Build Coastguard Worker 
94*0e209d39SAndroid Build Coastguard Worker 
95*0e209d39SAndroid Build Coastguard Worker /**
96*0e209d39SAndroid Build Coastguard Worker  *   Shift-JIS charset recognizer.
97*0e209d39SAndroid Build Coastguard Worker  *
98*0e209d39SAndroid Build Coastguard Worker  */
99*0e209d39SAndroid Build Coastguard Worker class CharsetRecog_sjis : public CharsetRecog_mbcs {
100*0e209d39SAndroid Build Coastguard Worker public:
101*0e209d39SAndroid Build Coastguard Worker     virtual ~CharsetRecog_sjis();
102*0e209d39SAndroid Build Coastguard Worker 
103*0e209d39SAndroid Build Coastguard Worker     UBool nextChar(IteratedChar *it, InputText *det) const override;
104*0e209d39SAndroid Build Coastguard Worker 
105*0e209d39SAndroid Build Coastguard Worker     UBool match(InputText* input, CharsetMatch *results) const override;
106*0e209d39SAndroid Build Coastguard Worker 
107*0e209d39SAndroid Build Coastguard Worker     const char *getName() const override;
108*0e209d39SAndroid Build Coastguard Worker     const char *getLanguage() const override;
109*0e209d39SAndroid Build Coastguard Worker 
110*0e209d39SAndroid Build Coastguard Worker };
111*0e209d39SAndroid Build Coastguard Worker 
112*0e209d39SAndroid Build Coastguard Worker 
113*0e209d39SAndroid Build Coastguard Worker /**
114*0e209d39SAndroid Build Coastguard Worker  *   EUC charset recognizers.  One abstract class that provides the common function
115*0e209d39SAndroid Build Coastguard Worker  *             for getting the next character according to the EUC encoding scheme,
116*0e209d39SAndroid Build Coastguard Worker  *             and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
117*0e209d39SAndroid Build Coastguard Worker  *
118*0e209d39SAndroid Build Coastguard Worker  */
119*0e209d39SAndroid Build Coastguard Worker class CharsetRecog_euc : public CharsetRecog_mbcs
120*0e209d39SAndroid Build Coastguard Worker {
121*0e209d39SAndroid Build Coastguard Worker public:
122*0e209d39SAndroid Build Coastguard Worker     virtual ~CharsetRecog_euc();
123*0e209d39SAndroid Build Coastguard Worker 
124*0e209d39SAndroid Build Coastguard Worker     const char *getName() const override = 0;
125*0e209d39SAndroid Build Coastguard Worker     const char *getLanguage() const override = 0;
126*0e209d39SAndroid Build Coastguard Worker 
127*0e209d39SAndroid Build Coastguard Worker     UBool match(InputText* input, CharsetMatch *results) const override = 0;
128*0e209d39SAndroid Build Coastguard Worker     /*
129*0e209d39SAndroid Build Coastguard Worker      *  (non-Javadoc)
130*0e209d39SAndroid Build Coastguard Worker      *  Get the next character value for EUC based encodings.
131*0e209d39SAndroid Build Coastguard Worker      *  Character "value" is simply the raw bytes that make up the character
132*0e209d39SAndroid Build Coastguard Worker      *     packed into an int.
133*0e209d39SAndroid Build Coastguard Worker      */
134*0e209d39SAndroid Build Coastguard Worker     UBool nextChar(IteratedChar *it, InputText *det) const override;
135*0e209d39SAndroid Build Coastguard Worker };
136*0e209d39SAndroid Build Coastguard Worker 
137*0e209d39SAndroid Build Coastguard Worker /**
138*0e209d39SAndroid Build Coastguard Worker  * The charset recognize for EUC-JP.  A singleton instance of this class
139*0e209d39SAndroid Build Coastguard Worker  *    is created and kept by the public CharsetDetector class
140*0e209d39SAndroid Build Coastguard Worker  */
141*0e209d39SAndroid Build Coastguard Worker class CharsetRecog_euc_jp : public CharsetRecog_euc
142*0e209d39SAndroid Build Coastguard Worker {
143*0e209d39SAndroid Build Coastguard Worker public:
144*0e209d39SAndroid Build Coastguard Worker     virtual ~CharsetRecog_euc_jp();
145*0e209d39SAndroid Build Coastguard Worker 
146*0e209d39SAndroid Build Coastguard Worker     const char *getName() const override;
147*0e209d39SAndroid Build Coastguard Worker     const char *getLanguage() const override;
148*0e209d39SAndroid Build Coastguard Worker 
149*0e209d39SAndroid Build Coastguard Worker     UBool match(InputText* input, CharsetMatch *results) const override;
150*0e209d39SAndroid Build Coastguard Worker };
151*0e209d39SAndroid Build Coastguard Worker 
152*0e209d39SAndroid Build Coastguard Worker /**
153*0e209d39SAndroid Build Coastguard Worker  * The charset recognize for EUC-KR.  A singleton instance of this class
154*0e209d39SAndroid Build Coastguard Worker  *    is created and kept by the public CharsetDetector class
155*0e209d39SAndroid Build Coastguard Worker  */
156*0e209d39SAndroid Build Coastguard Worker class CharsetRecog_euc_kr : public CharsetRecog_euc
157*0e209d39SAndroid Build Coastguard Worker {
158*0e209d39SAndroid Build Coastguard Worker public:
159*0e209d39SAndroid Build Coastguard Worker     virtual ~CharsetRecog_euc_kr();
160*0e209d39SAndroid Build Coastguard Worker 
161*0e209d39SAndroid Build Coastguard Worker     const char *getName() const override;
162*0e209d39SAndroid Build Coastguard Worker     const char *getLanguage() const override;
163*0e209d39SAndroid Build Coastguard Worker 
164*0e209d39SAndroid Build Coastguard Worker     UBool match(InputText* input, CharsetMatch *results) const override;
165*0e209d39SAndroid Build Coastguard Worker };
166*0e209d39SAndroid Build Coastguard Worker 
167*0e209d39SAndroid Build Coastguard Worker /**
168*0e209d39SAndroid Build Coastguard Worker  *
169*0e209d39SAndroid Build Coastguard Worker  *   Big5 charset recognizer.
170*0e209d39SAndroid Build Coastguard Worker  *
171*0e209d39SAndroid Build Coastguard Worker  */
172*0e209d39SAndroid Build Coastguard Worker class CharsetRecog_big5 : public CharsetRecog_mbcs
173*0e209d39SAndroid Build Coastguard Worker {
174*0e209d39SAndroid Build Coastguard Worker public:
175*0e209d39SAndroid Build Coastguard Worker     virtual ~CharsetRecog_big5();
176*0e209d39SAndroid Build Coastguard Worker 
177*0e209d39SAndroid Build Coastguard Worker     UBool nextChar(IteratedChar* it, InputText* det) const override;
178*0e209d39SAndroid Build Coastguard Worker 
179*0e209d39SAndroid Build Coastguard Worker     const char *getName() const override;
180*0e209d39SAndroid Build Coastguard Worker     const char *getLanguage() const override;
181*0e209d39SAndroid Build Coastguard Worker 
182*0e209d39SAndroid Build Coastguard Worker     UBool match(InputText* input, CharsetMatch *results) const override;
183*0e209d39SAndroid Build Coastguard Worker };
184*0e209d39SAndroid Build Coastguard Worker 
185*0e209d39SAndroid Build Coastguard Worker 
186*0e209d39SAndroid Build Coastguard Worker /**
187*0e209d39SAndroid Build Coastguard Worker  *
188*0e209d39SAndroid Build Coastguard Worker  *   GB-18030 recognizer. Uses simplified Chinese statistics.
189*0e209d39SAndroid Build Coastguard Worker  *
190*0e209d39SAndroid Build Coastguard Worker  */
191*0e209d39SAndroid Build Coastguard Worker class CharsetRecog_gb_18030 : public CharsetRecog_mbcs
192*0e209d39SAndroid Build Coastguard Worker {
193*0e209d39SAndroid Build Coastguard Worker public:
194*0e209d39SAndroid Build Coastguard Worker     virtual ~CharsetRecog_gb_18030();
195*0e209d39SAndroid Build Coastguard Worker 
196*0e209d39SAndroid Build Coastguard Worker     UBool nextChar(IteratedChar* it, InputText* det) const override;
197*0e209d39SAndroid Build Coastguard Worker 
198*0e209d39SAndroid Build Coastguard Worker     const char *getName() const override;
199*0e209d39SAndroid Build Coastguard Worker     const char *getLanguage() const override;
200*0e209d39SAndroid Build Coastguard Worker 
201*0e209d39SAndroid Build Coastguard Worker     UBool match(InputText* input, CharsetMatch *results) const override;
202*0e209d39SAndroid Build Coastguard Worker };
203*0e209d39SAndroid Build Coastguard Worker 
204*0e209d39SAndroid Build Coastguard Worker U_NAMESPACE_END
205*0e209d39SAndroid Build Coastguard Worker 
206*0e209d39SAndroid Build Coastguard Worker #endif
207*0e209d39SAndroid Build Coastguard Worker #endif /* __CSRMBCS_H */
208