xref: /aosp_15_r20/external/cronet/third_party/boringssl/src/crypto/bytestring/unicode.c (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 /* Copyright (c) 2018, Google Inc.
2  *
3  * Permission to use, copy, modify, and/or distribute this software for any
4  * purpose with or without fee is hereby granted, provided that the above
5  * copyright notice and this permission notice appear in all copies.
6  *
7  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10  * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12  * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
14 
15 #include <openssl/bytestring.h>
16 
17 #include "internal.h"
18 
19 
// is_valid_code_point returns one if |v| is a code point this module accepts
// for interchange and zero otherwise. Section references are to Unicode
// 15.0.0.
static int is_valid_code_point(uint32_t v) {
  // The Unicode code space runs from zero to 0x10ffff (3.4 D9).
  if (v > 0x10ffff) {
    return 0;
  }

  // The last two code points of every plane (0x...fffe and 0x...ffff) are
  // permanently reserved as noncharacters (3.4 D14; see also 23.7). As our
  // APIs are intended for "open interchange", such as ASN.1, reject them.
  if ((v & 0xfffe) == 0xfffe) {
    return 0;
  }

  // 0xfdd0-0xfdef are likewise noncharacters (3.4 D14).
  if (0xfdd0 <= v && v <= 0xfdef) {
    return 0;
  }

  // Surrogate code points are invalid (3.2 C1).
  if (0xd800 <= v && v <= 0xdfff) {
    return 0;
  }

  return 1;
}
35 
// BOTTOM_BITS returns a byte with the bottom |n| bits set. |n| must be between
// zero and eight, inclusive. The expansion is fully parenthesized so it behaves
// as a single operand wherever it is used (e.g. under |sizeof| or next to a
// postfix operator).
#define BOTTOM_BITS(n) ((uint8_t)((1u << (n)) - 1))

// TOP_BITS returns a byte with the top |n| bits set. |n| must be between zero
// and eight, inclusive.
#define TOP_BITS(n) ((uint8_t)~BOTTOM_BITS(8 - (n)))
41 
// CBS_get_utf8 reads one UTF-8 encoded code point from |cbs|. On success it
// stores the code point in |*out| and returns one. It returns zero, leaving
// |*out| unwritten, if the input ends early, a byte is malformed, the encoding
// is overlong, or the decoded value fails |is_valid_code_point|.
int CBS_get_utf8(CBS *cbs, uint32_t *out) {
  uint8_t byte;
  if (!CBS_get_u8(cbs, &byte)) {
    return 0;
  }

  // A byte with the top bit clear is the whole code point.
  if (byte < 0x80) {
    *out = byte;
    return 1;
  }

  // Classify the lead byte: how many continuation bytes follow, which payload
  // bits it carries, and the smallest value this length may legally encode.
  uint32_t code_point, min_value;
  size_t remaining;
  if ((byte & TOP_BITS(3)) == TOP_BITS(2)) {
    // 110xxxxx: two-byte sequence.
    remaining = 1;
    min_value = 0x80;
    code_point = byte & BOTTOM_BITS(5);
  } else if ((byte & TOP_BITS(4)) == TOP_BITS(3)) {
    // 1110xxxx: three-byte sequence.
    remaining = 2;
    min_value = 0x800;
    code_point = byte & BOTTOM_BITS(4);
  } else if ((byte & TOP_BITS(5)) == TOP_BITS(4)) {
    // 11110xxx: four-byte sequence.
    remaining = 3;
    min_value = 0x10000;
    code_point = byte & BOTTOM_BITS(3);
  } else {
    // A stray continuation byte or an unsupported lead byte.
    return 0;
  }

  while (remaining > 0) {
    if (!CBS_get_u8(cbs, &byte)) {
      return 0;
    }
    // Every continuation byte must look like 10xxxxxx.
    if ((byte & TOP_BITS(2)) != TOP_BITS(1)) {
      return 0;
    }
    code_point = (code_point << 6) | (byte & BOTTOM_BITS(6));
    remaining--;
  }

  // Values below |min_value| could have been encoded in fewer bytes; such
  // overlong forms are rejected along with invalid code points.
  if (code_point < min_value || !is_valid_code_point(code_point)) {
    return 0;
  }

  *out = code_point;
  return 1;
}
83 
// CBS_get_latin1 reads one byte from |cbs| and stores it, unchanged, in |*out|
// as the code point. It returns one on success and zero, leaving |*out|
// unwritten, if no byte could be read.
int CBS_get_latin1(CBS *cbs, uint32_t *out) {
  uint8_t byte;
  if (CBS_get_u8(cbs, &byte)) {
    // Every byte value is accepted; no validity check is applied here.
    *out = byte;
    return 1;
  }
  return 0;
}
92 
// CBS_get_ucs2_be reads one 16-bit unit from |cbs| with |CBS_get_u16| and
// treats it as a complete code point. It returns one and stores the value in
// |*out| on success, and zero, leaving |*out| unwritten, if the read fails or
// the value fails |is_valid_code_point|.
int CBS_get_ucs2_be(CBS *cbs, uint32_t *out) {
  uint16_t unit;
  // UCS-2 (used by BMPString) has no surrogate pairs, so a surrogate value is
  // simply an invalid code point and is rejected by the validity check.
  if (CBS_get_u16(cbs, &unit) && is_valid_code_point(unit)) {
    *out = unit;
    return 1;
  }
  return 0;
}
103 
// CBS_get_utf32_be reads one 32-bit value from |cbs| with |CBS_get_u32| and
// treats it as a code point. It returns one and stores the value in |*out| on
// success. It returns zero if the read fails or the value fails
// |is_valid_code_point|.
int CBS_get_utf32_be(CBS *cbs, uint32_t *out) {
  // Decode into a local so that a rejected value (surrogate, noncharacter, or
  // out of range) is never left in |*out|. This matches the other decoders in
  // this file, which only write |*out| on success.
  uint32_t v;
  if (!CBS_get_u32(cbs, &v) || !is_valid_code_point(v)) {
    return 0;
  }
  *out = v;
  return 1;
}
107 
// CBB_get_utf8_len returns the number of bytes (one to four) that a UTF-8
// encoding of |u| occupies. It does not check that |u| is a valid code point;
// every value above 0xffff reports four.
size_t CBB_get_utf8_len(uint32_t u) {
  // Start at one byte and add one for each length threshold |u| exceeds.
  size_t len = 1;
  len += (u > 0x7f);
  len += (u > 0x7ff);
  len += (u > 0xffff);
  return len;
}
120 
// CBB_add_utf8 appends the UTF-8 encoding of |u| to |cbb|. It returns one on
// success and zero if |u| fails |is_valid_code_point| or if any |CBB_add_u8|
// call reports failure.
int CBB_add_utf8(CBB *cbb, uint32_t u) {
  if (!is_valid_code_point(u)) {
    return 0;
  }

  // Code points up to 0x7f are encoded as a single identical byte.
  if (u <= 0x7f) {
    return CBB_add_u8(cbb, (uint8_t)u);
  }

  // Pick the lead byte and the bit offset of the payload that remains after
  // it. Each continuation byte then carries the next six bits.
  uint8_t lead;
  unsigned shift;
  if (u <= 0x7ff) {
    lead = (uint8_t)(TOP_BITS(2) | (u >> 6));
    shift = 6;
  } else if (u <= 0xffff) {
    lead = (uint8_t)(TOP_BITS(3) | (u >> 12));
    shift = 12;
  } else if (u <= 0x10ffff) {
    lead = (uint8_t)(TOP_BITS(4) | (u >> 18));
    shift = 18;
  } else {
    // Not reachable after the validity check above; kept as a safeguard.
    return 0;
  }

  if (!CBB_add_u8(cbb, lead)) {
    return 0;
  }
  while (shift > 0) {
    shift -= 6;
    const uint8_t cont =
        (uint8_t)(TOP_BITS(1) | ((u >> shift) & BOTTOM_BITS(6)));
    if (!CBB_add_u8(cbb, cont)) {
      return 0;
    }
  }
  return 1;
}
145 
// CBB_add_latin1 appends |u| to |cbb| as a single byte. It returns one on
// success and zero if |u| does not fit in a byte or |CBB_add_u8| reports
// failure.
int CBB_add_latin1(CBB *cbb, uint32_t u) {
  if (u > 0xff) {
    return 0;
  }
  // Normalize the result to exactly zero or one.
  return CBB_add_u8(cbb, (uint8_t)u) != 0;
}
149 
// CBB_add_ucs2_be appends |u| to |cbb| as one 16-bit unit via |CBB_add_u16|.
// It returns one on success and zero if |u| does not fit in 16 bits, fails
// |is_valid_code_point|, or |CBB_add_u16| reports failure.
int CBB_add_ucs2_be(CBB *cbb, uint32_t u) {
  if (u > 0xffff || !is_valid_code_point(u)) {
    return 0;
  }
  // Normalize the result to exactly zero or one.
  return CBB_add_u16(cbb, (uint16_t)u) != 0;
}
153 
// CBB_add_utf32_be appends |u| to |cbb| as one 32-bit value via |CBB_add_u32|.
// It returns one on success and zero if |u| fails |is_valid_code_point| or
// |CBB_add_u32| reports failure.
int CBB_add_utf32_be(CBB *cbb, uint32_t u) {
  if (!is_valid_code_point(u)) {
    return 0;
  }
  // Normalize the result to exactly zero or one.
  return CBB_add_u32(cbb, u) != 0;
}
157