2 * Copyright (C) 2005-2013 Team XBMC
5 * This Program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2, or (at your option)
10 * This Program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with XBMC; see the file COPYING. If not, see
17 * <http://www.gnu.org/licenses/>.
21 #include "settings/Settings.h"
22 #include "utils/CharsetConverter.h"
23 #include "utils/StdString.h"
24 #include "utils/Utf8Utils.h"
27 #include "gtest/gtest.h"
// Reference UTF-16 code units used by the utf16LEtoW test. The values are the
// fullwidth-form (U+FFxx) spelling of "test_utf16LEtow", followed by a 0
// terminator code unit.
29 static const uint16_t refutf16LE1[] = { 0xff54, 0xff45, 0xff53, 0xff54,
30 0xff3f, 0xff55, 0xff54, 0xff46,
31 0xff11, 0xff16, 0xff2c, 0xff25,
32 0xff54, 0xff4f, 0xff57, 0x0 };
// Expected UTF-16 output for the utf8To_UTF16LE test. The code units are the
// fullwidth-form (U+FFxx) spelling of
// "test_utf8To:_charset_UTF-16LE,_CStdString16", followed by a 0 terminator.
34 static const uint16_t refutf16LE2[] = { 0xff54, 0xff45, 0xff53, 0xff54,
35 0xff3f, 0xff55, 0xff54, 0xff46,
36 0xff18, 0xff34, 0xff4f, 0xff1a,
37 0xff3f, 0xff43, 0xff48, 0xff41,
38 0xff52, 0xff53, 0xff45, 0xff54,
39 0xff3f, 0xff35, 0xff34, 0xff26,
40 0xff0d, 0xff11, 0xff16, 0xff2c,
41 0xff25, 0xff0c, 0xff3f, 0xff23,
42 0xff33, 0xff54, 0xff44, 0xff33,
43 0xff54, 0xff52, 0xff49, 0xff4e,
44 0xff47, 0xff11, 0xff16, 0x0 };
// Byte-oriented UTF-16LE input used by stringCharsetToUtf8 and the
// isValidUtf8_* tests. Every character is a low byte followed by \377 (0xFF),
// so each byte pair is one U+FFxx code unit stored low byte first; together
// they spell the fullwidth form of "test_stringCharsetToUtf8".
// The array contains no embedded zero bytes; it ends with the single implicit
// NUL of the string literal (one byte, not a 16-bit terminator).
46 static const char refutf16LE3[] = "T\377E\377S\377T\377?\377S\377T\377"
47 "R\377I\377N\377G\377#\377H\377A\377"
48 "R\377S\377E\377T\377\064\377O\377\065"
49 "\377T\377F\377\030\377";
// UTF-16 input for the utf16LEtoUTF8 test: the fullwidth-form (U+FFxx)
// spelling of "test_utf16LEtoUTF8", followed by a 0 terminator code unit.
51 static const uint16_t refutf16LE4[] = { 0xff54, 0xff45, 0xff53, 0xff54,
52 0xff3f, 0xff55, 0xff54, 0xff46,
53 0xff11, 0xff16, 0xff2c, 0xff25,
54 0xff54, 0xff4f, 0xff35, 0xff34,
55 0xff26, 0xff18, 0x0 };
// Expected UTF-32 output for the utf8To_UTF32LE test. The visible code points
// spell the fullwidth form of "test_utf8To:_charset_UTF-32LE,_CStdString32_"
// and the array ends with U+1F42D, U+1F42E (both above 0xFFFF) and a 0
// terminator.
// NOTE(review): the embedded line numbers jump from 67 to 71, so part of this
// initializer is not visible here - confirm the missing rows against the
// real file before editing the data.
57 static const uint32_t refutf32LE1[] = { 0xff54, 0xff45, 0xff53, 0xff54,
58 0xff3f, 0xff55, 0xff54, 0xff46,
59 0xff18, 0xff34, 0xff4f, 0xff1a,
60 0xff3f, 0xff43, 0xff48, 0xff41,
61 0xff52, 0xff53, 0xff45, 0xff54,
62 0xff3f, 0xff35, 0xff34, 0xff26,
63 0xff0d, 0xff13, 0xff12, 0xff2c,
64 0xff25, 0xff0c, 0xff3f, 0xff23,
65 0xff33, 0xff54, 0xff44, 0xff33,
66 0xff54, 0xff52, 0xff49, 0xff4e,
67 0xff47, 0xff13, 0xff12, 0xff3f,
71 0x1f42d, 0x1f42e, 0x0 };
// UTF-16 input for the utf16BEtoUTF8 test. Each value has its two bytes
// swapped relative to the other reference arrays (0x54ff here versus 0xff54
// there); un-swapped, the visible values spell the fullwidth form of
// "test_utf16BEtoUT".
// NOTE(review): the closing rows and the terminator of this initializer are
// not visible in this listing - confirm against the real file.
74 static const uint16_t refutf16BE[] = { 0x54ff, 0x45ff, 0x53ff, 0x54ff,
75 0x3fff, 0x55ff, 0x54ff, 0x46ff,
76 0x11ff, 0x16ff, 0x22ff, 0x25ff,
77 0x54ff, 0x4fff, 0x35ff, 0x34ff,
// 16-bit input for the ucs2ToUTF8 test: the fullwidth-form (U+FFxx) spelling
// of "test_ucs2toUTF8", followed by a 0 terminator code unit.
80 static const uint16_t refucs2[] = { 0xff54, 0xff45, 0xff53, 0xff54,
81 0xff3f, 0xff55, 0xff43, 0xff53,
82 0xff12, 0xff54, 0xff4f, 0xff35,
83 0xff34, 0xff26, 0xff18, 0x0 };
// Test fixture shared by every TEST_F below.
// The constructor registers a "locale" settings category (language, country,
// charset) and a "subtitles" category (subtitles.charset) on CSettings::Get(),
// then calls reset() and clear() on g_charsetConverter. The destructor calls
// CSettings::Get().Unload().
// The string members are scratch storage for the tests: ref* members hold
// inputs or expected values, var* members receive converter output.
// NOTE(review): several lines of this class (braces, an access specifier,
// the closing line of the comment below and some argument continuation lines)
// are not visible in this listing.
85 class TestCharsetConverter : public testing::Test
88 TestCharsetConverter()
90 /* Add default settings for locale.
91 * Settings here are taken from CGUISettings::Initialize()
94 CSettingsCategory *loc = CSettings::Get().AddCategory(7, "locale", 14090);
95 CSettings::Get().AddString(loc, "locale.language",248,"english",
97 CSettings::Get().AddString(loc, "locale.country", 20026, "USA",
99 CSettings::Get().AddString(loc, "locale.charset", 14091, "DEFAULT",
100 SPIN_CONTROL_TEXT); // charset is set by the
103 // Add default settings for subtitles
104 CSettingsCategory *sub = CSettings::Get().AddCategory(5, "subtitles", 287);
105 CSettings::Get().AddString(sub, "subtitles.charset", 735, "DEFAULT",
109 g_charsetConverter.reset();
110 g_charsetConverter.clear();
113 ~TestCharsetConverter()
115 CSettings::Get().Unload();
118 CStdStringA refstra1, refstra2, varstra1;
119 CStdStringW refstrw1, varstrw1;
120 CStdString16 refstr16_1, varstr16_1;
121 CStdString32 refstr32_1, varstr32_1;
// Passes the narrow string refstra1 through utf8ToW() into varstrw1 and
// expects the result to equal the wide literal refstrw1 (EXPECT_STREQ on the
// c_str() pointers).
// NOTE(review): the meaning of the trailing (true, false, NULL) arguments is
// not visible in this file - check the utf8ToW declaration.
125 TEST_F(TestCharsetConverter, utf8ToW)
127 refstra1 = "test utf8ToW";
128 refstrw1 = L"test utf8ToW";
130 g_charsetConverter.utf8ToW(refstra1, varstrw1, true, false, NULL);
131 EXPECT_STREQ(refstrw1.c_str(), varstrw1.c_str());
// Loads the refutf16LE1 code units into refstr16_1, converts them with
// utf16LEtoW() and expects the wide result to equal refstrw1.
// NOTE(review): refstrw1 is shown here as plain ASCII while refutf16LE1 holds
// U+FFxx code units; confirm the literal has not been normalized in this copy
// of the file.
134 TEST_F(TestCharsetConverter, utf16LEtoW)
136 refstrw1 = L"test_utf16LEtow";
137 /* TODO: Should be able to use '=' operator instead of assign() */
138 refstr16_1.assign(refutf16LE1);
140 g_charsetConverter.utf16LEtoW(refstr16_1, varstrw1);
141 EXPECT_STREQ(refstrw1.c_str(), varstrw1.c_str());
// Runs subtitleCharsetToUtf8() on an ASCII-only input and expects the output
// string to be identical to the input.
144 TEST_F(TestCharsetConverter, subtitleCharsetToUtf8)
146 refstra1 = "test subtitleCharsetToW";
148 g_charsetConverter.subtitleCharsetToUtf8(refstra1, varstra1);
150 /* Compare the converted output against the unmodified input */
151 EXPECT_STREQ(refstra1.c_str(), varstra1.c_str());
// Two-argument form of utf8ToStringCharset(): converts refstra1 into varstra1
// and expects the output to equal the (ASCII-only) input.
154 TEST_F(TestCharsetConverter, utf8ToStringCharset_1)
156 refstra1 = "test utf8ToStringCharset";
158 g_charsetConverter.utf8ToStringCharset(refstra1, varstra1);
159 EXPECT_STREQ(refstra1.c_str(), varstra1.c_str());
// Single-argument form of utf8ToStringCharset(): varstra1 is both the input
// and the output. Expects it to still equal the reference copy refstra1
// afterwards.
162 TEST_F(TestCharsetConverter, utf8ToStringCharset_2)
164 refstra1 = "test utf8ToStringCharset";
165 varstra1 = "test utf8ToStringCharset";
166 g_charsetConverter.utf8ToStringCharset(varstra1);
167 EXPECT_STREQ(refstra1.c_str(), varstra1.c_str());
// Calls utf8ToSystem() with varstra1 as the only argument and expects the
// ASCII-only string to still equal the reference copy refstra1 afterwards.
170 TEST_F(TestCharsetConverter, utf8ToSystem)
172 refstra1 = "test utf8ToSystem";
173 varstra1 = "test utf8ToSystem";
174 g_charsetConverter.utf8ToSystem(varstra1);
175 EXPECT_STREQ(refstra1.c_str(), varstra1.c_str());
// Converts refstra1 with utf8To() using the "ASCII" target charset and a
// narrow output string; expects the output to equal the input.
178 TEST_F(TestCharsetConverter, utf8To_ASCII)
180 refstra1 = "test utf8To: charset ASCII, CStdStringA";
182 g_charsetConverter.utf8To("ASCII", refstra1, varstra1);
183 EXPECT_STREQ(refstra1.c_str(), varstra1.c_str());
// Converts refstra1 with utf8To() using the "UTF-16LE" target charset into the
// 16-bit string varstr16_1 and compares it with refutf16LE2 via memcmp.
// The comparison covers refstr16_1.length() code units of both buffers; the
// length of varstr16_1 itself is never checked.
// NOTE(review): the continuation of the refstra1 literal is not visible in
// this listing, and the visible part is ASCII while refutf16LE2 holds U+FFxx
// code units - confirm against the real file.
186 TEST_F(TestCharsetConverter, utf8To_UTF16LE)
188 refstra1 = "test_utf8To:_charset_UTF-16LE,_"
190 refstr16_1.assign(refutf16LE2);
192 g_charsetConverter.utf8To("UTF-16LE", refstra1, varstr16_1);
193 EXPECT_TRUE(!memcmp(refstr16_1.c_str(), varstr16_1.c_str(),
194 refstr16_1.length() * sizeof(uint16_t)));
// Converts refstra1 with utf8To() using the "UTF-32LE" target charset into the
// 32-bit string varstr32_1 and compares it with refutf32LE1 via memcmp.
// The comparison length is sizeof(refutf32LE1), i.e. the whole reference
// array including its 0 terminator; the length of varstr32_1 itself is never
// checked.
// NOTE(review): the continuation of the refstra1 literal and the lines
// between the comment and the assign() call are not visible in this listing.
197 TEST_F(TestCharsetConverter, utf8To_UTF32LE)
199 refstra1 = "test_utf8To:_charset_UTF-32LE,_"
201 /* OSX has its own 'special' utf-8 charset which we use (see UTF8_SOURCE in CharsetConverter.cpp)
202 which is basically NFD (decomposed) utf-8. The trouble is, it fails on the COW FACE and MOUSE FACE
203 characters for some reason (possibly anything over 0x100000, or maybe there's a decomposed form of these
204 that I couldn't find???) If UTF8_SOURCE is switched to UTF-8 then this test would pass as-is, but then
205 some filenames stored in utf8-mac wouldn't display correctly in the UI. */
210 refstr32_1.assign(refutf32LE1);
212 g_charsetConverter.utf8To("UTF-32LE", refstra1, varstr32_1);
213 EXPECT_TRUE(!memcmp(refstr32_1.c_str(), varstr32_1.c_str(),
214 sizeof(refutf32LE1)));
// Feeds the UTF-16LE byte sequence refutf16LE3 to ToUtf8() with "UTF-16LE" as
// the source charset and expects the narrow output to equal refstra1.
// NOTE(review): refstra1 is shown as plain ASCII while refutf16LE3 encodes
// U+FFxx code units - confirm the literal against the real file.
217 TEST_F(TestCharsetConverter, stringCharsetToUtf8)
219 refstra1 = "test_stringCharsetToUtf8";
221 g_charsetConverter.ToUtf8("UTF-16LE", refutf16LE3, varstra1);
222 EXPECT_STREQ(refstra1.c_str(), varstra1.c_str());
// Converts refutf16LE3 from "UTF-16LE" with ToUtf8() and expects
// CUtf8Utils::isValidUtf8() to accept the result passed as a C string.
// NOTE(review): the embedded line numbers skip 226-227, so a statement may be
// missing from this view.
225 TEST_F(TestCharsetConverter, isValidUtf8_1)
228 g_charsetConverter.ToUtf8("UTF-16LE", refutf16LE3, varstra1);
229 EXPECT_TRUE(CUtf8Utils::isValidUtf8(varstra1.c_str()));
// Assigns the raw UTF-16LE bytes of refutf16LE3 to refstr1 and expects
// CUtf8Utils::isValidUtf8() to reject them.
// NOTE(review): refstr1 is not among the fixture members visible in this
// listing - confirm where it is declared.
232 TEST_F(TestCharsetConverter, isValidUtf8_2)
234 refstr1 = refutf16LE3;
235 EXPECT_FALSE(CUtf8Utils::isValidUtf8(refstr1));
// Same visible statements as isValidUtf8_1: convert refutf16LE3 from
// "UTF-16LE" with ToUtf8() and expect isValidUtf8() to accept the result.
// NOTE(review): the embedded line numbers skip 239-240, so whatever
// distinguishes this case from isValidUtf8_1 may be missing from this view.
238 TEST_F(TestCharsetConverter, isValidUtf8_3)
241 g_charsetConverter.ToUtf8("UTF-16LE", refutf16LE3, varstra1);
242 EXPECT_TRUE(CUtf8Utils::isValidUtf8(varstra1.c_str()));
// Passes the raw UTF-16LE byte array refutf16LE3 directly to
// CUtf8Utils::isValidUtf8() and expects it to be rejected.
245 TEST_F(TestCharsetConverter, isValidUtf8_4)
247 EXPECT_FALSE(CUtf8Utils::isValidUtf8(refutf16LE3));
250 /* TODO: Resolve correct input/output for this function */
251 // TEST_F(TestCharsetConverter, ucs2CharsetToStringCharset)
253 // void ucs2CharsetToStringCharset(const CStdStringW& strSource,
254 // CStdStringA& strDest, bool swap = false);
// Converts the wide string refstrw1 with wToUTF8() and expects the narrow
// output to equal refstra1.
257 TEST_F(TestCharsetConverter, wToUTF8)
259 refstrw1 = L"test_wToUTF8";
260 refstra1 = "test_wToUTF8";
262 g_charsetConverter.wToUTF8(refstrw1, varstra1);
263 EXPECT_STREQ(refstra1.c_str(), varstra1.c_str());
// Loads the byte-swapped code units of refutf16BE into refstr16_1, converts
// them with utf16BEtoUTF8() and expects the narrow output to equal refstra1.
// NOTE(review): refstra1 is shown as plain ASCII while refutf16BE encodes
// U+FFxx code units - confirm the literal against the real file.
266 TEST_F(TestCharsetConverter, utf16BEtoUTF8)
268 refstr16_1.assign(refutf16BE);
269 refstra1 = "test_utf16BEtoUTF8";
271 g_charsetConverter.utf16BEtoUTF8(refstr16_1, varstra1);
272 EXPECT_STREQ(refstra1.c_str(), varstra1.c_str());
// Loads refutf16LE4 into refstr16_1, converts it with utf16LEtoUTF8() and
// expects the narrow output to equal refstra1.
275 TEST_F(TestCharsetConverter, utf16LEtoUTF8)
277 refstr16_1.assign(refutf16LE4);
278 refstra1 = "test_utf16LEtoUTF8";
280 g_charsetConverter.utf16LEtoUTF8(refstr16_1, varstra1);
281 EXPECT_STREQ(refstra1.c_str(), varstra1.c_str());
// Loads refucs2 into refstr16_1, converts it with ucs2ToUTF8() and expects
// the narrow output to equal refstra1.
284 TEST_F(TestCharsetConverter, ucs2ToUTF8)
286 refstr16_1.assign(refucs2);
287 refstra1 = "test_ucs2toUTF8";
289 g_charsetConverter.ucs2ToUTF8(refstr16_1, varstra1);
290 EXPECT_STREQ(refstra1.c_str(), varstra1.c_str());
// Runs utf8logicalToVisualBiDi() on refstra1 and expects the output to equal
// refstra2, which holds the same text as the input (i.e. no reordering is
// expected for this string).
293 TEST_F(TestCharsetConverter, utf8logicalToVisualBiDi)
295 refstra1 = "test_utf8logicalToVisualBiDi";
296 refstra2 = "test_utf8logicalToVisualBiDi";
298 g_charsetConverter.utf8logicalToVisualBiDi(refstra1, varstra1);
299 EXPECT_STREQ(refstra2.c_str(), varstra1.c_str());
302 /* TODO: Resolve correct input/output for this function */
303 // TEST_F(TestCharsetConverter, utf32ToStringCharset)
305 // void utf32ToStringCharset(const unsigned long* strSource, CStdStringA& strDest);
// Builds the expected list of charset labels, in order, and compares it with
// the vector returned by getCharsetLabels().
// ASSERT_EQ on the sizes stops the test before the element loop if the counts
// differ, so the reflabels.at() lookups below stay in range.
// The label strings are compared verbatim (including the "Vietnamesse"
// spelling), so they must match whatever the converter returns.
308 TEST_F(TestCharsetConverter, getCharsetLabels)
310 std::vector<CStdString> reflabels;
311 reflabels.push_back("Western Europe (ISO)");
312 reflabels.push_back("Central Europe (ISO)");
313 reflabels.push_back("South Europe (ISO)");
314 reflabels.push_back("Baltic (ISO)");
315 reflabels.push_back("Cyrillic (ISO)");
316 reflabels.push_back("Arabic (ISO)");
317 reflabels.push_back("Greek (ISO)");
318 reflabels.push_back("Hebrew (ISO)");
319 reflabels.push_back("Turkish (ISO)");
320 reflabels.push_back("Central Europe (Windows)");
321 reflabels.push_back("Cyrillic (Windows)");
322 reflabels.push_back("Western Europe (Windows)");
323 reflabels.push_back("Greek (Windows)");
324 reflabels.push_back("Turkish (Windows)");
325 reflabels.push_back("Hebrew (Windows)");
326 reflabels.push_back("Arabic (Windows)");
327 reflabels.push_back("Baltic (Windows)");
328 reflabels.push_back("Vietnamesse (Windows)");
329 reflabels.push_back("Thai (Windows)");
330 reflabels.push_back("Chinese Traditional (Big5)");
331 reflabels.push_back("Chinese Simplified (GBK)");
332 reflabels.push_back("Japanese (Shift-JIS)");
333 reflabels.push_back("Korean");
334 reflabels.push_back("Hong Kong (Big5-HKSCS)");
336 std::vector<std::string> varlabels = g_charsetConverter.getCharsetLabels();
337 ASSERT_EQ(reflabels.size(), varlabels.size());
// Walk the returned labels; (it - varlabels.begin()) is the element index
// used to pick the matching expected label.
339 std::vector<std::string>::iterator it;
340 for (it = varlabels.begin(); it < varlabels.end(); it++)
342 EXPECT_STREQ((reflabels.at(it - varlabels.begin())).c_str(), (*it).c_str());
// Looks up the label for charset name "ISO-8859-1" and expects
// "Western Europe (ISO)"; then looks up the unknown name "Bogus" and expects
// an empty string.
// NOTE(review): the declaration of varstr (embedded line 348) is not visible
// in this listing.
346 TEST_F(TestCharsetConverter, getCharsetLabelByName)
349 g_charsetConverter.getCharsetLabelByName("ISO-8859-1");
350 EXPECT_STREQ("Western Europe (ISO)", varstr.c_str());
352 varstr = g_charsetConverter.getCharsetLabelByName("Bogus");
353 EXPECT_STREQ("", varstr.c_str());
// Looks up the charset name for label "Western Europe (ISO)" and expects
// "ISO-8859-1"; then looks up the unknown label "Bogus" and expects an empty
// string.
// NOTE(review): the declaration of varstr (embedded line 358) is not visible
// in this listing.
356 TEST_F(TestCharsetConverter, getCharsetNameByLabel)
359 g_charsetConverter.getCharsetNameByLabel("Western Europe (ISO)");
360 EXPECT_STREQ("ISO-8859-1", varstr.c_str());
362 varstr = g_charsetConverter.getCharsetNameByLabel("Bogus");
363 EXPECT_STREQ("", varstr.c_str());
// Single-argument form of unknownToUTF8(): varstra1 is both the input and the
// output. Expects the ASCII-only string to still equal refstra1 afterwards.
366 TEST_F(TestCharsetConverter, unknownToUTF8_1)
368 refstra1 = "test_unknownToUTF8";
369 varstra1 = "test_unknownToUTF8";
370 g_charsetConverter.unknownToUTF8(varstra1);
371 EXPECT_STREQ(refstra1.c_str(), varstra1.c_str());
// Two-argument form of unknownToUTF8(): converts refstra1 into varstra1 and
// expects the output to equal the (ASCII-only) input.
374 TEST_F(TestCharsetConverter, unknownToUTF8_2)
376 refstra1 = "test_unknownToUTF8";
378 g_charsetConverter.unknownToUTF8(refstra1, varstra1);
379 EXPECT_STREQ(refstra1.c_str(), varstra1.c_str());
// Calls toW() with refstra1 as the source and "UTF-16LE" as the charset
// argument, and expects the wide output to equal the hex-escaped wide literal
// refstrw1.
// NOTE(review): how the expected \x values were derived is not shown here;
// treat the literal as opaque reference data.
382 TEST_F(TestCharsetConverter, toW)
384 refstra1 = "test_toW:_charset_UTF-16LE";
385 refstrw1 = L"\xBDEF\xEF94\x85BD\xBDEF\xEF93\x94BD\xBCEF\xEFBF"
386 L"\x94BD\xBDEF\xEF8F\xB7BC\xBCEF\xEF9A\xBFBC\xBDEF"
387 L"\xEF83\x88BD\xBDEF\xEF81\x92BD\xBDEF\xEF93\x85BD"
388 L"\xBDEF\xEF94\xBFBC\xBCEF\xEFB5\xB4BC\xBCEF\xEFA6"
389 L"\x8DBC\xBCEF\xEF91\x96BC\xBCEF\xEFAC\xA5BC";
391 g_charsetConverter.toW(refstra1, varstrw1, "UTF-16LE");
392 EXPECT_STREQ(refstrw1.c_str(), varstrw1.c_str());
// Calls fromW() with the hex-escaped wide literal refstrw1 as the source and
// "UTF-16LE" as the charset argument, and expects the narrow output to equal
// refstra1.
// NOTE(review): the last line of the refstrw1 literal (embedded line 402) is
// not visible in this listing.
395 TEST_F(TestCharsetConverter, fromW)
397 refstrw1 = L"\xBDEF\xEF94\x85BD\xBDEF\xEF93\x94BD\xBCEF\xEFBF"
398 L"\x86BD\xBDEF\xEF92\x8FBD\xBDEF\xEF8D\xB7BC\xBCEF"
399 L"\xEF9A\xBFBC\xBDEF\xEF83\x88BD\xBDEF\xEF81\x92BD"
400 L"\xBDEF\xEF93\x85BD\xBDEF\xEF94\xBFBC\xBCEF\xEFB5"
401 L"\xB4BC\xBCEF\xEFA6\x8DBC\xBCEF\xEF91\x96BC\xBCEF"
403 refstra1 = "test_fromW:_charset_UTF-16LE";
405 g_charsetConverter.fromW(refstrw1, varstra1, "UTF-16LE");
406 EXPECT_STREQ(refstra1.c_str(), varstra1.c_str());