1 /*
2  * Copyright (c) 2009-2021, Google LLC
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of Google LLC nor the
13  *       names of its contributors may be used to endorse or promote products
14  *       derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL Google LLC BE LIABLE FOR ANY DIRECT,
20  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #ifndef UPB_LEX_UNICODE_H_
29 #define UPB_LEX_UNICODE_H_
30 
31 // Must be last.
32 #include "upb/port/def.inc"
33 
34 #ifdef __cplusplus
35 extern "C" {
36 #endif
37 
// Reports whether `cp` is a high surrogate value, i.e. whether it lies in the
// inclusive range [0xd800, 0xdbff].
UPB_INLINE bool upb_Unicode_IsHigh(uint32_t cp) {
  const uint32_t first_high = 0xd800;
  const uint32_t last_high = 0xdbff;

  if (cp < first_high) return false;
  return cp <= last_high;
}
42 
// Reports whether `cp` is a low surrogate value, i.e. whether it lies in the
// inclusive range [0xdc00, 0xdfff].
UPB_INLINE bool upb_Unicode_IsLow(uint32_t cp) {
  const uint32_t first_low = 0xdc00;
  const uint32_t last_low = 0xdfff;

  if (cp < first_low) return false;
  return cp <= last_low;
}
47 
// Computes the high (first) 16-bit surrogate for a supplementary codepoint.
// The input is not sanity-checked: any `cp` is accepted and the result is
// simply truncated to 16 bits.
UPB_INLINE uint16_t upb_Unicode_ToHigh(uint32_t cp) {
  // 0xd7c0 == 0xd800 - (0x10000 >> 10): the 0x10000 offset is folded into the
  // base, so cp in [0x10000, 0x10ffff] maps onto [0xd800, 0xdbff].
  const uint32_t upper_bits = cp >> 10;
  const uint32_t surrogate = upper_bits + 0xd7c0;
  return (uint16_t)surrogate;
}
53 
// Computes the low (second) 16-bit surrogate for a supplementary codepoint.
// The input is not sanity-checked: only the bottom ten bits of `cp` are used.
UPB_INLINE uint16_t upb_Unicode_ToLow(uint32_t cp) {
  // The low surrogate is 0xdc00 with the codepoint's ten least-significant
  // bits as its payload.
  const uint32_t payload = cp & 0x3ff;
  return (uint16_t)(0xdc00 | payload);
}
59 
// Combines a pair of 16-bit surrogates into the 32-bit value they encode.
// The input is not sanity-checked: only the bottom ten bits of `high` and of
// `low` contribute to the result.
UPB_INLINE uint32_t upb_Unicode_FromPair(uint32_t high, uint32_t low) {
  // The two ten-bit payloads occupy disjoint bit positions (19..10 and 9..0).
  const uint32_t upper = (high & 0x3ff) << 10;
  const uint32_t lower = low & 0x3ff;
  return 0x10000 + (upper | lower);
}
65 
// Outputs the codepoint `cp` as UTF-8 into the buffer at `out`.
//
// Returns the number of bytes written: 1-4 on success, 0 on error.
//
// Does not sanity-check the input. Specifically, it does not check for
// surrogates.
//
// NOTE(review): since up to 4 bytes may be written, `out` presumably needs
// room for at least 4 bytes -- confirm against the implementation, which is
// not visible from this header.
int upb_Unicode_ToUTF8(uint32_t cp, char* out);
70 
71 #ifdef __cplusplus
72 } /* extern "C" */
73 #endif
74 
75 #include "upb/port/undef.inc"
76 
77 #endif /* UPB_LEX_UNICODE_H_ */
78