1 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
2 // test_utf8_codecvt.cpp
3 
4 // (C) Copyright 2002-4 Robert Ramey - http://www.rrsd.com .
5 // Use, modification and distribution is subject to the Boost Software
6 // License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
7 // http://www.boost.org/LICENSE_1_0.txt)
8 
9 #include <algorithm> // std::copy
10 #include <fstream>
11 #include <iostream>
12 #include <iterator>
13 #include <locale>
14 #include <vector>
15 #include <string>
16 
17 #include <cstddef> // size_t
18 #include <cwchar>
19 #include <boost/config.hpp>
20 #include <boost/core/no_exceptions_support.hpp>
21 
22 #define BOOST_UTF8_BEGIN_NAMESPACE namespace boost { namespace detail {
23 #define BOOST_UTF8_END_NAMESPACE } }
24 #include <boost/detail/utf8_codecvt_facet.hpp>
25 #include <boost/detail/utf8_codecvt_facet.ipp>
26 
// Boost.Config defines BOOST_NO_STDC_NAMESPACE for standard libraries whose
// <c...> headers leave the C library names in the global namespace only.
// Import the names this test spells with a std:: prefix.
#if defined(BOOST_NO_STDC_NAMESPACE)
namespace std{
    using ::size_t;
    using ::wcslen;
#if !defined(UNDER_CE) && !defined(__PGIC__)
    // The test reads characters into a std::wint_t, so that is the type
    // which has to be visible in std.
    using ::wint_t;
#endif
} // namespace std
#endif

// Note: copied from boost/iostreams/char_traits.hpp
//
// Dinkumware that comes with QNX Momentics 6.3.0, 4.0.2, incorrectly defines
// the EOF and WEOF macros so that they do not std:: qualify the wint_t type
// (and so does Sun C++ 5.8 + STLport 4). Fix by making wint_t visible in
// this scope.
// NOTE: Use BOOST_WORKAROUND?
#if (defined(__QNX__) && defined(BOOST_DINKUMWARE_STDLIB))  \
    || defined(__SUNPRO_CC)
    using ::std::wint_t;
#endif
47 
48 #include <boost/core/lightweight_test.hpp>
49 
// Paired reference encodings of the same code points: once as raw UTF-8
// bytes and once as wide characters.  The template argument is the size of
// wchar_t in bytes; definitions are supplied below for sizes 2 and 4 only.
template<std::size_t WideCharSize>
struct test_data
{
    static unsigned char utf8_encoding[];
    static wchar_t wchar_encoding[];
};

// ---- 2-byte wchar_t: boundary values of the 1-, 2- and 3-byte sequences ----

template<>
unsigned char test_data<2>::utf8_encoding[] = {
    0x01,               // U+0001
    0x7f,               // U+007F
    0xc2, 0x80,         // U+0080
    0xdf, 0xbf,         // U+07FF
    0xe0, 0xa0, 0x80,   // U+0800
    0xe7, 0xbf, 0xbf    // U+7FFF
};

template<>
wchar_t test_data<2>::wchar_encoding[] = {
    0x0001, 0x007f,     // one UTF-8 byte each
    0x0080, 0x07ff,     // two UTF-8 bytes each
    0x0800, 0x7fff      // three UTF-8 bytes each
};

// ---- 4-byte wchar_t: boundary values up to U+10FFFF ----
//
// codecvt implementations for clang and gcc don't handle more than 21 bits
// and return eof accordingly, so the whole 32-bit range is not tested.
// The sequences left out on purpose are:
//     0xf7, 0xbf, 0xbf, 0xbf                   (0x001fffff)
//     0xf8, 0x88, 0x80, 0x80, 0x80             (0x00200000)
//     0xfb, 0xbf, 0xbf, 0xbf, 0xbf             (0x03ffffff)
//     0xfc, 0x84, 0x80, 0x80, 0x80, 0x80       (0x04000000)
//     0xfd, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf       (0x7fffffff)

template<>
unsigned char test_data<4>::utf8_encoding[] = {
    0x01,                   // U+0001
    0x7f,                   // U+007F
    0xc2, 0x80,             // U+0080
    0xdf, 0xbf,             // U+07FF
    0xe0, 0xa0, 0x80,       // U+0800
    0xef, 0xbf, 0xbf,       // U+FFFF
    0xf0, 0x90, 0x80, 0x80, // U+10000
    0xf4, 0x8f, 0xbf, 0xbf  // U+10FFFF
};

template<>
wchar_t test_data<4>::wchar_encoding[] = {
    static_cast<wchar_t>(0x00000001),
    static_cast<wchar_t>(0x0000007f),
    static_cast<wchar_t>(0x00000080),
    static_cast<wchar_t>(0x000007ff),
    static_cast<wchar_t>(0x00000800),
    static_cast<wchar_t>(0x0000ffff),
    static_cast<wchar_t>(0x00010000),
    static_cast<wchar_t>(0x0010ffff)
};
120 
121 int
test_main(int,char * [])122 test_main(int /* argc */, char * /* argv */[]) {
123     std::locale utf8_locale
124         = std::locale(
125             std::locale::classic(),
126             new boost::detail::utf8_codecvt_facet
127         );
128 
129     typedef char utf8_t;
130     // define test data compatible with the wchar_t implementation
131     // as either ucs-2 or ucs-4 depending on the compiler/library.
132     typedef test_data<sizeof(wchar_t)> td;
133 
134     // Send our test UTF-8 data to file
135     {
136         std::ofstream ofs;
137         ofs.open("test.dat");
138         std::copy(
139             td::utf8_encoding,
140             td::utf8_encoding + sizeof(td::utf8_encoding) / sizeof(unsigned char),
141             std::ostream_iterator<utf8_t>(ofs)
142         );
143     }
144 
145     // Read the test data back in, converting to UCS-4 on the way in
146     std::vector<wchar_t> from_file;
147     {
148         std::wifstream ifs;
149         ifs.imbue(utf8_locale);
150         ifs.open("test.dat");
151 
152         std::wint_t item = 0;
153         // note can't use normal vector from iterator constructor because
154         // dinkumware doesn't have it.
155         for(;;){
156             item = ifs.get();
157             if(item == WEOF)
158                 break;
159             //ifs >> item;
160             //if(ifs.eof())
161             //    break;
162             from_file.push_back(item);
163         }
164     }
165 
166     BOOST_TEST(std::equal(from_file.begin(), from_file.end(), td::wchar_encoding));
167 
168     // Send the UCS4_data back out, converting to UTF-8
169     {
170         std::wofstream ofs;
171         ofs.imbue(utf8_locale);
172         ofs.open("test2.dat");
173         std::copy(
174             from_file.begin(),
175             from_file.end(),
176             std::ostream_iterator<wchar_t, wchar_t>(ofs)
177         );
178     }
179 
180     // Make sure that both files are the same
181     {
182         typedef std::istream_iterator<utf8_t> is_iter;
183         is_iter end_iter;
184 
185         std::ifstream ifs1("test.dat");
186         is_iter it1(ifs1);
187         std::vector<utf8_t> data1;
188         std::copy(it1, end_iter, std::back_inserter(data1));
189 
190         std::ifstream ifs2("test2.dat");
191         is_iter it2(ifs2);
192         std::vector<utf8_t> data2;
193         std::copy(it2, end_iter, std::back_inserter(data2));
194 
195         BOOST_TEST(data1 == data2);
196     }
197 
198     // some libraries have trouble that only shows up with longer strings
199 
200     const wchar_t * test3_data = L"\
201     <?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\" ?>\
202     <!DOCTYPE boost_serialization>\
203     <boost_serialization signature=\"serialization::archive\" version=\"3\">\
204     <a class_id=\"0\" tracking_level=\"0\">\
205         <b>1</b>\
206         <f>96953204</f>\
207         <g>177129195</g>\
208         <l>1</l>\
209         <m>5627</m>\
210         <n>23010</n>\
211         <o>7419</o>\
212         <p>16212</p>\
213         <q>4086</q>\
214         <r>2749</r>\
215         <c>-33</c>\
216         <s>124</s>\
217         <t>28</t>\
218         <u>32225</u>\
219         <v>17543</v>\
220         <w>0.84431422</w>\
221         <x>1.0170664757130923</x>\
222         <y>tjbx</y>\
223         <z>cuwjentqpkejp</z>\
224     </a>\
225     </boost_serialization>\
226     ";
227 
228     // Send the UCS4_data back out, converting to UTF-8
229     std::size_t l = std::wcslen(test3_data);
230     {
231         std::wofstream ofs;
232         ofs.imbue(utf8_locale);
233         ofs.open("test3.dat");
234         std::copy(
235             test3_data,
236             test3_data + l,
237             std::ostream_iterator<wchar_t, wchar_t>(ofs)
238         );
239     }
240 
241     // Make sure that both files are the same
242     {
243         std::wifstream ifs;
244         ifs.imbue(utf8_locale);
245         ifs.open("test3.dat");
246         ifs >> std::noskipws;
247         BOOST_TEST(
248             std::equal(
249                 test3_data,
250                 test3_data + l,
251                 std::istream_iterator<wchar_t, wchar_t>(ifs)
252             )
253         );
254     }
255 
256     // Test length calculation
257     {
258         std::codecvt<wchar_t, char, std::mbstate_t> const& fac = std::use_facet< std::codecvt<wchar_t, char, std::mbstate_t> >(utf8_locale);
259         std::mbstate_t mbs = std::mbstate_t();
260         const int utf8_len = sizeof(td::utf8_encoding) / sizeof(*td::utf8_encoding);
261         int res = fac.length(mbs, reinterpret_cast< const char* >(td::utf8_encoding), reinterpret_cast< const char* >(td::utf8_encoding + utf8_len), ~static_cast< std::size_t >(0u));
262         BOOST_TEST_EQ(utf8_len, res);
263     }
264 
265     // Test that length calculation detects character boundaries
266     {
267         std::codecvt<wchar_t, char, std::mbstate_t> const& fac = std::use_facet< std::codecvt<wchar_t, char, std::mbstate_t> >(utf8_locale);
268         std::mbstate_t mbs = std::mbstate_t();
269         // The first 5 bytes of utf8_encoding contain 3 complete UTF-8 characters (taking 4 bytes in total) and 1 byte of an incomplete character.
270         // This last byte should not be accounted by length().
271         const int input_len = 5;
272         const int utf8_len = 4;
273         int res = fac.length(mbs, reinterpret_cast< const char* >(td::utf8_encoding), reinterpret_cast< const char* >(td::utf8_encoding + input_len), ~static_cast< std::size_t >(0u));
274         BOOST_TEST_EQ(utf8_len, res);
275     }
276 
277     return EXIT_SUCCESS;
278 }
279 
280 int
main(int argc,char * argv[])281 main(int argc, char * argv[]){
282 
283     int retval = 1;
284     BOOST_TRY{
285         retval = test_main(argc, argv);
286     }
287     #ifndef BOOST_NO_EXCEPTION_STD_NAMESPACE
288         BOOST_CATCH(const std::exception & e){
289             BOOST_ERROR(e.what());
290         }
291     #endif
292     BOOST_CATCH(...){
293         BOOST_ERROR("failed with uncaught exception:");
294     }
295     BOOST_CATCH_END
296 
297     int error_count = boost::report_errors();
298     if(error_count > 0)
299         retval = error_count;
300     return retval;
301 }
302 
303