xref: /aosp_15_r20/external/flac/src/share/utf8/iconvert.c (revision 600f14f40d737144c998e2ec7a483122d3776fbc)
1*600f14f4SXin Li /*
2*600f14f4SXin Li  * Copyright (C) 2001 Edmund Grimley Evans <[email protected]>
3*600f14f4SXin Li  *
4*600f14f4SXin Li  * This program is free software; you can redistribute it and/or modify
5*600f14f4SXin Li  * it under the terms of the GNU General Public License as published by
6*600f14f4SXin Li  * the Free Software Foundation; either version 2 of the License, or
7*600f14f4SXin Li  * (at your option) any later version.
8*600f14f4SXin Li  *
9*600f14f4SXin Li  * This program is distributed in the hope that it will be useful,
10*600f14f4SXin Li  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11*600f14f4SXin Li  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12*600f14f4SXin Li  * GNU General Public License for more details.
13*600f14f4SXin Li  *
14*600f14f4SXin Li  * You should have received a copy of the GNU General Public License along
15*600f14f4SXin Li  * with this program; if not, write to the Free Software Foundation, Inc.,
16*600f14f4SXin Li  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
17*600f14f4SXin Li  */
18*600f14f4SXin Li 
19*600f14f4SXin Li #ifdef HAVE_CONFIG_H
20*600f14f4SXin Li #  include <config.h>
21*600f14f4SXin Li #endif
22*600f14f4SXin Li 
23*600f14f4SXin Li #if !defined _WIN32 && defined HAVE_ICONV
24*600f14f4SXin Li 
25*600f14f4SXin Li #include <assert.h>
26*600f14f4SXin Li #include <errno.h>
27*600f14f4SXin Li #include <iconv.h>
28*600f14f4SXin Li #include <stdio.h>
29*600f14f4SXin Li #include <stdlib.h>
30*600f14f4SXin Li #include <string.h>
31*600f14f4SXin Li 
32*600f14f4SXin Li #include "iconvert.h"
33*600f14f4SXin Li #include "share/alloc.h"
34*600f14f4SXin Li #include "share/safe_str.h"
35*600f14f4SXin Li 
36*600f14f4SXin Li /*
37*600f14f4SXin Li  * Convert data from one encoding to another. Return:
38*600f14f4SXin Li  *
39*600f14f4SXin Li  *  -2 : memory allocation failed
40*600f14f4SXin Li  *  -1 : unknown encoding
41*600f14f4SXin Li  *   0 : data was converted exactly
42*600f14f4SXin Li  *   1 : data was converted inexactly
43*600f14f4SXin Li  *   2 : data was invalid (but still converted)
44*600f14f4SXin Li  *
45*600f14f4SXin Li  * We convert in two steps, via UTF-8, as this is the only
46*600f14f4SXin Li  * reliable way of distinguishing between invalid input
47*600f14f4SXin Li  * and valid input which iconv refuses to transliterate.
48*600f14f4SXin Li  * We convert from UTF-8 twice, because we have no way of
49*600f14f4SXin Li  * knowing whether the conversion was exact if iconv returns
50*600f14f4SXin Li  * E2BIG (due to a bug in the specification of iconv).
51*600f14f4SXin Li  * An alternative approach is to assume that the output of
52*600f14f4SXin Li  * iconv is never more than 4 times as long as the input,
53*600f14f4SXin Li  * but I prefer to avoid that assumption if possible.
54*600f14f4SXin Li  */
55*600f14f4SXin Li 
/*
 * Parameters:
 *   fromcode, tocode : iconv encoding names for the input and the output
 *   from, fromlen    : input bytes and their count
 *   to               : if non-NULL, receives a heap-allocated, NUL-terminated
 *                      result buffer (presumably the caller frees it - confirm
 *                      against callers)
 *   tolen            : if non-NULL, receives the result length in bytes,
 *                      not counting the terminating NUL
 *
 * On a negative return value neither *to nor *tolen is written.
 */
int iconvert(const char *fromcode, const char *tocode,
	     const char *from, size_t fromlen,
	     char **to, size_t *tolen)
{
  int ret = 0;
  iconv_t cd1, cd2;
  char *ib;
  char *ob;
  char *utfbuf = NULL, *outbuf, *newbuf;
  size_t utflen, outlen, ibl, obl, obp, k;
  char tbuf[2048];

  cd1 = iconv_open("UTF-8", fromcode);
  if (cd1 == (iconv_t)(-1))
    return -1;

  cd2 = (iconv_t)(-1);
  /*
   * Is the target encoding UTF-8?  Don't use strcasecmp() as it's
   * locale-dependent.  The comparison is written out explicitly (rather
   * than with strchr(), which also matches the terminating NUL) so that
   * evaluation stops at the first mismatch and never reads past the end
   * of a short tocode string.
   */
  if (!((tocode[0] == 'U' || tocode[0] == 'u') &&
	(tocode[1] == 'T' || tocode[1] == 't') &&
	(tocode[2] == 'F' || tocode[2] == 'f') &&
	tocode[3] == '-' &&
	tocode[4] == '8' &&
	tocode[5] == '\0')) {
    char *tocode1;
    int rc;
    /*
     * Try using this non-standard feature of glibc and libiconv.
     * This is deliberately not a config option as people often
     * change their iconv library without rebuilding applications.
     */

    rc = asprintf(&tocode1, "%s//TRANSLIT", tocode);
    if (rc < 0 || ! tocode1)
      goto fail;

    cd2 = iconv_open(tocode1, "UTF-8");
    free(tocode1);

    /*
     * Fall back to a plain conversion without transliteration.  The
     * source must be UTF-8 here, because cd2 is only ever fed the
     * intermediate UTF-8 buffer, never the original input.
     */
    if (cd2 == (iconv_t)(-1))
      cd2 = iconv_open(tocode, "UTF-8");

    if (cd2 == (iconv_t)(-1)) {
      iconv_close(cd1);
      return -1;
    }
  }

  /* Start tiny and grow on demand; the loop below doubles the buffer. */
  utflen = 1; /*fromlen * 2 + 1; XXX */
  utfbuf = malloc(utflen);
  if (!utfbuf)
    goto fail;

  /* Convert to UTF-8 */
  ib = (char *)from; /* iconv() takes a non-const input pointer */
  ibl = fromlen;
  ob = utfbuf;
  obl = utflen;
  for (;;) {
    k = iconv(cd1, &ib, &ibl, &ob, &obl);
    assert((!k && !ibl) ||
	   (k == (size_t)(-1) && errno == E2BIG && ibl && obl < 6) ||
	   (k == (size_t)(-1) &&
	    (errno == EILSEQ || errno == EINVAL) && ibl));
    if (!ibl)
      break;
    if (obl < 6) {
      /* Enlarge the buffer */
      if (utflen * 2 < utflen) /* overflow check */
	goto fail;
      utflen *= 2;
      obp = ob - utfbuf; /* save position */
      newbuf = realloc(utfbuf, utflen);
      if (!newbuf)
	goto fail;
      ob = newbuf + obp;
      obl = utflen - obp;
      utfbuf = newbuf;
    }
    else {
      /* Invalid input: skip one byte, emit '#', and reset the state */
      ib++, ibl--;
      *ob++ = '#', obl--;
      ret = 2;
      iconv(cd1, NULL, NULL, NULL, NULL);
    }
  }

  if (cd2 == (iconv_t)(-1)) {
    /* The target encoding was UTF-8 */
    if (tolen)
      *tolen = ob - utfbuf;
    if (!to) {
      free(utfbuf);
      iconv_close(cd1);
      return ret;
    }
    /* Resize to the exact length plus room for the terminating NUL */
    newbuf = safe_realloc_nofree_add_2op_(utfbuf, (ob - utfbuf), /*+*/1);
    if (!newbuf)
      goto fail;
    ob = (ob - utfbuf) + newbuf;
    *ob = '\0';
    *to = newbuf;
    iconv_close(cd1);
    return ret;
  }

  /* Truncate the buffer to be tidy */
  utflen = ob - utfbuf;
  /*
   * NOTE(review): an empty intermediate buffer (e.g. empty input) is
   * reported as -2 rather than as a successful empty conversion; this
   * also avoids calling realloc() with a size of zero.  Kept as-is.
   */
  if (utflen == 0)
    goto fail;
  newbuf = realloc(utfbuf, utflen);
  if (!newbuf)
    goto fail;
  utfbuf = newbuf;

  /* Convert from UTF-8 to discover how long the output is */
  outlen = 0;
  ib = utfbuf;
  ibl = utflen;
  while (ibl) {
    /* tbuf is scratch space only; each pass just counts bytes produced */
    ob = tbuf;
    obl = sizeof(tbuf);
    k = iconv(cd2, &ib, &ibl, &ob, &obl);
    assert((k != (size_t)(-1) && !ibl) ||
	   (k == (size_t)(-1) && errno == E2BIG && ibl) ||
	   (k == (size_t)(-1) && errno == EILSEQ && ibl));
    if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) {
      /* Replace one character */
      char *tb = "?";
      size_t tbl = 1;

      outlen += ob - tbuf;
      ob = tbuf;
      obl = sizeof(tbuf);
      k = iconv(cd2, &tb, &tbl, &ob, &obl);
      assert((!k && !tbl) ||
	     (k == (size_t)(-1) && errno == EILSEQ && tbl));
      /* Skip the lead byte and any following bytes with the high bit set */
      for (++ib, --ibl; ibl && (*ib & 0x80); ib++, ibl--)
	;
    }
    outlen += ob - tbuf;
  }
  /* Flush: emit any pending output and return cd2 to its initial state */
  ob = tbuf;
  obl = sizeof(tbuf);
  k = iconv(cd2, NULL, NULL, &ob, &obl);
  assert(!k);
  outlen += ob - tbuf;

  /* Convert from UTF-8 for real */
  outbuf = safe_malloc_add_2op_(outlen, /*+*/1);
  if (!outbuf)
    goto fail;
  ib = utfbuf;
  ibl = utflen;
  ob = outbuf;
  obl = outlen;
  while (ibl) {
    k = iconv(cd2, &ib, &ibl, &ob, &obl);
    assert((k != (size_t)(-1) && !ibl) ||
	   (k == (size_t)(-1) && errno == EILSEQ && ibl));
    /* Any non-zero result (irreversible conversions or error) is inexact */
    if (k && !ret)
      ret = 1;
    if (ibl && !(k == (size_t)(-1) && errno == E2BIG)) {
      /* Replace one character */
      char *tb = "?";
      size_t tbl = 1;

      k = iconv(cd2, &tb, &tbl, &ob, &obl);
      assert((!k && !tbl) ||
	     (k == (size_t)(-1) && errno == EILSEQ && tbl));
      for (++ib, --ibl; ibl && (*ib & 0x80); ib++, ibl--)
	;
    }
  }
  k = iconv(cd2, NULL, NULL, &ob, &obl);
  assert(!k);
  assert(!obl); /* the measuring pass must have predicted the exact size */
  *ob = '\0';

  free(utfbuf);
  iconv_close(cd1);
  iconv_close(cd2);
  if (tolen)
    *tolen = outlen;
  if (!to) {
    free(outbuf);
    return ret;
  }
  *to = outbuf;
  return ret;

 fail:
  free(utfbuf); /* free(NULL) is a no-op */
  iconv_close(cd1);
  if (cd2 != (iconv_t)(-1))
    iconv_close(cd2);
  return -2;
}
256*600f14f4SXin Li 
257*600f14f4SXin Li #endif /* HAVE_ICONV */
258