xref: /aosp_15_r20/external/fbjni/cxx/fbjni/detail/utf8.h (revision 65c59e023c5336bbd4a23be7af78407e3d80e7e7)
1 /*
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cstdint>
20 #include <string>
21 
22 #include <jni.h>
23 
24 namespace facebook {
25 namespace jni {
26 
27 namespace detail {
28 
// Encoding helpers. Only the declarations are visible in this header; the
// notes below describe the signatures and hedge anything that depends on the
// out-of-view definitions.

// Takes `len` input bytes at `bytes` plus an output buffer `modified` with an
// accompanying size `modifiedLength`.
// NOTE(review): presumably re-encodes standard UTF-8 as Modified UTF-8 into
// `modified`, with `modifiedLength` being the space available there (see the
// modifiedLength() overloads) — confirm against the definition.
29 void utf8ToModifiedUTF8(
30     const uint8_t* bytes,
31     size_t len,
32     uint8_t* modified,
33     size_t modifiedLength);
// NOTE(review): presumably returns the number of bytes `str` occupies once
// encoded as Modified UTF-8 — confirm against the definition.
34 size_t modifiedLength(const std::string& str);
// Overload for a raw byte pointer. `length` is a non-const pointer, so it is
// presumably an out-parameter receiving the length of `str`; whether it may
// be null is not visible here — TODO confirm.
35 size_t modifiedLength(const uint8_t* str, size_t* length);
// Builds a std::string from `len` bytes at `modified`; declared noexcept.
// NOTE(review): presumably the reverse conversion (Modified UTF-8 back to
// standard UTF-8) — confirm against the definition.
36 std::string modifiedUTF8ToUTF8(const uint8_t* modified, size_t len) noexcept;
// Builds a std::string from `len` 16-bit units at `utf16Bytes`; declared
// noexcept. NOTE(review): despite the parameter name, `len` presumably counts
// 16-bit code units rather than bytes — confirm against the definition.
37 std::string utf16toUTF8(const uint16_t* utf16Bytes, size_t len) noexcept;
38 
39 } // namespace detail
40 
41 // JNI represents strings encoded with a modified version of UTF-8.  The first
42 // difference between UTF-8 and Modified UTF-8 is that the latter supports only
43 // the 1-byte, 2-byte, and 3-byte formats. A supplementary character (4 bytes
44 // in UTF-8) needs to be represented in the form of a surrogate pair. To create
45 // a Modified UTF-8 surrogate pair that Dalvik would understand we take the
46 // 4-byte character, encode it with UTF-16, which gives us two 2-byte code
47 // units (a surrogate pair), and then encode each surrogate as UTF-8. This
48 // results in 2 x 3-byte sequences.  To convert Modified UTF-8 to standard
49 // UTF-8, this must be reversed.
50 //
51 // The second difference is that Modified UTF-8 encodes the NUL byte in the
52 // 2-byte format.
53 //
54 // In order to avoid complex error handling, only a minimum of validity checking
55 // is done to avoid crashing.  If the input is invalid, the output may be
56 // invalid as well.
57 //
58 // Relevant links:
59 //  -
60 //  http://docs.oracle.com/javase/7/docs/technotes/guides/jni/spec/functions.html
61 //  -
62 //  https://docs.oracle.com/javase/6/docs/api/java/io/DataInput.html#modified-utf-8
63 
64 // JString to UTF16 extractor using RAII idiom. Note that the
65 // ctor/dtor use GetStringCritical/ReleaseStringCritical, so this
66 // class is subject to the restrictions imposed by those functions.
67 class JStringUtf16Extractor {
68  public:
JStringUtf16Extractor(JNIEnv * env,jstring javaString)69   JStringUtf16Extractor(JNIEnv* env, jstring javaString)
70       : env_(env), javaString_(javaString), length_(0), utf16String_(nullptr) {
71     if (env_ && javaString_) {
72       length_ = env_->GetStringLength(javaString_);
73       utf16String_ = env_->GetStringCritical(javaString_, nullptr);
74     }
75   }
76 
~JStringUtf16Extractor()77   ~JStringUtf16Extractor() {
78     if (utf16String_) {
79       env_->ReleaseStringCritical(javaString_, utf16String_);
80     }
81   }
82 
length()83   jsize length() const {
84     return length_;
85   }
86 
chars()87   const jchar* chars() const {
88     return utf16String_;
89   }
90 
91  private:
92   JNIEnv* env_;
93   jstring javaString_;
94   jsize length_;
95   const jchar* utf16String_;
96 };
97 
98 } // namespace jni
99 } // namespace facebook
100