7CC30DB116291A5C003E7579 /* MusicThumbLoader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7CC30DAF16291A5C003E7579 /* MusicThumbLoader.cpp */; };
7CC30DC016291C2C003E7579 /* VideoThumbLoader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7CC30DBE16291C2C003E7579 /* VideoThumbLoader.cpp */; };
7CC30E8A16296078003E7579 /* EdenVideoArtUpdater.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7CC30E8816296078003E7579 /* EdenVideoArtUpdater.cpp */; };
+ 7CC82C9318284F9F0010DF30 /* CharsetDetection.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7CC82C9118284F9F0010DF30 /* CharsetDetection.cpp */; };
+ 7CC82C9418284F9F0010DF30 /* CharsetDetection.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7CC82C9118284F9F0010DF30 /* CharsetDetection.cpp */; };
+ 7CC82C9518284F9F0010DF30 /* CharsetDetection.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7CC82C9118284F9F0010DF30 /* CharsetDetection.cpp */; };
7CCF7F1D1069F3AE00992676 /* Builtins.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7CCF7F1B1069F3AE00992676 /* Builtins.cpp */; };
7CCF7FC9106A0DF500992676 /* TimeUtils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7CCF7FC7106A0DF500992676 /* TimeUtils.cpp */; };
7CCFD98D151494E100211D82 /* PCMCodec.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7CCFD98A151494E100211D82 /* PCMCodec.cpp */; };
7CC30DBF16291C2C003E7579 /* VideoThumbLoader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = VideoThumbLoader.h; sourceTree = "<group>"; };
7CC30E8816296078003E7579 /* EdenVideoArtUpdater.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = EdenVideoArtUpdater.cpp; sourceTree = "<group>"; };
7CC30E8916296078003E7579 /* EdenVideoArtUpdater.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = EdenVideoArtUpdater.h; sourceTree = "<group>"; };
+ 7CC82C9118284F9F0010DF30 /* CharsetDetection.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CharsetDetection.cpp; sourceTree = "<group>"; };
+ 7CC82C9218284F9F0010DF30 /* CharsetDetection.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CharsetDetection.h; sourceTree = "<group>"; };
7CCF7F1B1069F3AE00992676 /* Builtins.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Builtins.cpp; sourceTree = "<group>"; };
7CCF7F1C1069F3AE00992676 /* Builtins.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Builtins.h; sourceTree = "<group>"; };
7CCF7FC7106A0DF500992676 /* TimeUtils.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = TimeUtils.cpp; sourceTree = "<group>"; };
DFECFB1B172D9D0100A43CF7 /* BooleanLogic.h */,
E38E1E290D25F9FD00618676 /* CharsetConverter.cpp */,
E38E1E2A0D25F9FD00618676 /* CharsetConverter.h */,
+ 7CC82C9118284F9F0010DF30 /* CharsetDetection.cpp */,
+ 7CC82C9218284F9F0010DF30 /* CharsetDetection.h */,
E38E1E2B0D25F9FD00618676 /* CPUInfo.cpp */,
E38E1E2C0D25F9FD00618676 /* CPUInfo.h */,
18B7C8E712942603009E7A26 /* Crc32.cpp */,
DFEF0BAC180ADE6400AEAED1 /* FileItemListModification.cpp in Sources */,
DFEF0BC1180ADEDA00AEAED1 /* SmartPlaylistFileItemListModifier.cpp in Sources */,
7C2612671820667C0086E04D /* ISettingControl.cpp in Sources */,
+ 7CC82C9318284F9F0010DF30 /* CharsetDetection.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
DFEF0BAE180ADE6400AEAED1 /* FileItemListModification.cpp in Sources */,
DFEF0BC3180ADEDA00AEAED1 /* SmartPlaylistFileItemListModifier.cpp in Sources */,
7C2612691820667C0086E04D /* ISettingControl.cpp in Sources */,
+ 7CC82C9518284F9F0010DF30 /* CharsetDetection.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
DFEF0BAD180ADE6400AEAED1 /* FileItemListModification.cpp in Sources */,
DFEF0BC2180ADEDA00AEAED1 /* SmartPlaylistFileItemListModifier.cpp in Sources */,
7C2612681820667C0086E04D /* ISettingControl.cpp in Sources */,
+ 7CC82C9418284F9F0010DF30 /* CharsetDetection.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
<ClInclude Include="..\..\xbmc\settings\windows\GUIWindowTestPattern.h" />
<ClInclude Include="..\..\xbmc\utils\ActorProtocol.h" />
<ClInclude Include="..\..\xbmc\utils\BooleanLogic.h" />
+ <ClInclude Include="..\..\xbmc\utils\CharsetDetection.h" />
<ClInclude Include="..\..\xbmc\utils\IRssObserver.h" />
<ClInclude Include="..\..\xbmc\utils\IXmlDeserializable.h" />
<ClInclude Include="..\..\xbmc\utils\LegacyPathTranslation.h" />
<ClCompile Include="..\..\xbmc\ThumbLoader.cpp" />
<ClCompile Include="..\..\xbmc\utils\ActorProtocol.cpp" />
<ClCompile Include="..\..\xbmc\utils\BooleanLogic.cpp" />
+ <ClCompile Include="..\..\xbmc\utils\CharsetDetection.cpp" />
<ClCompile Include="..\..\xbmc\utils\LegacyPathTranslation.cpp" />
<ClCompile Include="..\..\xbmc\utils\RssManager.cpp" />
<ClCompile Include="..\..\xbmc\utils\StringValidation.cpp" />
<ClCompile Include="..\..\xbmc\settings\ISettingControl.cpp">
<Filter>settings</Filter>
</ClCompile>
+ <ClCompile Include="..\..\xbmc\utils\CharsetDetection.cpp">
+ <Filter>utils</Filter>
+ </ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\xbmc\win32\pch.h">
<ClInclude Include="..\..\xbmc\settings\SettingDefinitions.h">
<Filter>settings</Filter>
</ClInclude>
+ <ClInclude Include="..\..\xbmc\utils\CharsetDetection.h">
+ <Filter>utils</Filter>
+ </ClInclude>
</ItemGroup>
<ItemGroup>
<ResourceCompile Include="..\..\xbmc\win32\XBMC_PC.rc">
else
strDoc = m_headofdoc;
- CStdString encoding;
- XMLUtils::GetEncoding(&doc, encoding);
-
- CStdString strUtf8(strDoc);
- if (encoding.IsEmpty())
- g_charsetConverter.unknownToUTF8(strUtf8);
- else
- g_charsetConverter.ToUtf8(encoding, strDoc, strUtf8);
-
- doc.Clear();
- doc.Parse(strUtf8, TIXML_ENCODING_UTF8);
+ doc.Parse(strDoc, TIXML_ENCODING_UNKNOWN);
return details.Load(doc.RootElement(), true, prioritise);
}
CLog::Log(LOGDEBUG,"scraper: %s returned %s",function.c_str(),strXML.c_str());
- if (!XMLUtils::HasUTF8Declaration(strXML))
- g_charsetConverter.unknownToUTF8(strXML);
-
CXBMCTinyXML doc;
- doc.Parse(strXML, TIXML_ENCODING_UTF8);
+ doc.Parse(strXML, TIXML_ENCODING_UNKNOWN);
if (!doc.RootElement())
{
CLog::Log(LOGERROR, "%s: Unable to parse XML",__FUNCTION__);
return false;
}
- XMLUtils::GetEncoding(&xmlDoc, encoding);
-
TiXmlElement* pRootElement = xmlDoc.RootElement();
if (!pRootElement || pRootElement->NoChildren() ||
pRootElement->ValueStr()!=CStdString("strings"))
{
uint32_t id = atoi(attrId) + offset;
if (m_strings.find(id) == m_strings.end())
- m_strings[id].strTranslated = ToUTF8(encoding, pChild->FirstChild()->Value());
+ m_strings[id].strTranslated = pChild->FirstChild()->Value();
}
pChild = pChild->NextSiblingElement("string");
}
if (root == NULL)
return false;
- CStdString encoding;
- XMLUtils::GetEncoding(&m_xmlDoc, encoding);
-
- return LoadFromXML(root, encoding);
+ return LoadFromXML(root);
}
bool CSmartPlaylist::Load(const CStdString &path)
--- /dev/null
+/*
+* Copyright (C) 2013 Team XBMC
+* http://xbmc.org
+*
+* This Program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2, or (at your option)
+* any later version.
+*
+* This Program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with XBMC; see the file COPYING. If not, see
+* <http://www.gnu.org/licenses/>.
+*
+*/
+
+#include <algorithm>
+#include "CharsetDetection.h"
+#include "utils/CharsetConverter.h"
+#include "utils/StringUtils.h"
+
+/* An XML declaration can be virtually any size (padded with arbitrary whitespace),
+ * but in the real world we don't need to process megabytes of data,
+ * so limit the search for the XML declaration to a reasonable value */
+const size_t CCharsetDetection::m_XmlDeclarationMaxLength = 250;
+
+
+/**
+ * Detect text encoding by Byte Order Mark (BOM).
+ * Multibyte encodings (UTF-16/32) are always returned with explicit endianness (LE/BE).
+ * @param content       pointer to the text to analyze
+ * @param contentLength length of the text in bytes
+ * @return detected encoding name, or empty string if no known BOM is present
+ */
+std::string CCharsetDetection::GetBomEncoding(const char* const content, const size_t contentLength)
+{
+  if (!content || contentLength < 2)
+    return "";
+
+  // compare as unsigned bytes so that values above 0x7F need no per-literal casts
+  const unsigned char* const bytes = reinterpret_cast<const unsigned char*>(content);
+
+  if (bytes[0] == 0xFE && bytes[1] == 0xFF)
+    return "UTF-16BE";
+  /* first two bytes are same for UTF-16LE and UTF-32LE, so first check for full UTF-32LE mark */
+  if (contentLength >= 4 && bytes[0] == 0xFF && bytes[1] == 0xFE && bytes[2] == 0x00 && bytes[3] == 0x00)
+    return "UTF-32LE";
+  if (bytes[0] == 0xFF && bytes[1] == 0xFE)
+    return "UTF-16LE";
+
+  if (contentLength < 3)
+    return "";
+  if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF)
+    return "UTF-8";
+
+  if (contentLength < 4)
+    return "";
+  if (bytes[0] == 0x00 && bytes[1] == 0x00 && bytes[2] == 0xFE && bytes[3] == 0xFF)
+    return "UTF-32BE";
+  /* UTF-7 mark is "+/v" followed by one of '8', '9', '+' or '/' as the FOURTH byte
+   * (the fourth byte also carries bits of the following character, hence four variants) */
+  if (bytes[0] == 0x2B && bytes[1] == 0x2F && bytes[2] == 0x76 &&
+      (bytes[3] == 0x38 || bytes[3] == 0x39 || bytes[3] == 0x2B || bytes[3] == 0x2F))
+    return "UTF-7";
+  if (bytes[0] == 0x84 && bytes[1] == 0x31 && bytes[2] == 0x95 && bytes[3] == 0x33)
+    return "GB18030";
+
+  return "";
+}
+
+/**
+ * Detect the encoding of an XML document.
+ * Detection order:
+ *  1. Byte Order Mark;
+ *  2. "encoding=" read from an XML declaration that is readable as single-byte text;
+ *  3. a guess based on how the start of the document looks in UCS-4/UTF-16/EBCDIC, refined by
+ *     the declaration read after converting the head of the document with the guessed encoding.
+ * @param xmlContent       pointer to the XML text to analyze
+ * @param contentLength    length of the XML text in bytes
+ * @param detectedEncoding receives the detected encoding; left empty when the function returns false
+ * @return true if an encoding was detected, false otherwise
+ */
+bool CCharsetDetection::DetectXmlEncoding(const char* const xmlContent, const size_t contentLength, std::string& detectedEncoding)
+{
+  detectedEncoding.clear();
+
+  if (!xmlContent || contentLength < 2)
+    return false; // too short for any detection
+
+  /* Byte Order Mark has priority over "encoding=" parameter */
+  detectedEncoding = GetBomEncoding(xmlContent, contentLength);
+  if (!detectedEncoding.empty())
+    return true;
+
+  /* try to read encoding from XML declaration */
+  if (GetXmlEncodingFromDeclaration(xmlContent, contentLength, detectedEncoding))
+  {
+    StringUtils::ToUpper(detectedEncoding);
+
+    /* make some safety checks */
+    if (detectedEncoding == "UTF-8")
+      return true; // fast track for most common case
+
+    if (StringUtils::StartsWith(detectedEncoding, "UCS-") || StringUtils::StartsWith(detectedEncoding, "UTF-"))
+    {
+      if (detectedEncoding == "UTF-7")
+        return true;
+
+      /* XML declaration was read without any conversion (by 'GetXmlEncodingFromDeclaration'), so we know
+       * that the text is in a single-byte encoding, but the declaration itself wrongly specifies a multibyte encoding */
+      detectedEncoding.clear();
+      return false;
+    }
+    return true;
+  }
+
+  /* try to detect basic encoding */
+  std::string guessedEncoding;
+  if (!GuessXmlEncoding(xmlContent, contentLength, guessedEncoding))
+    return false; /* can't detect any encoding */
+
+  /* have some guessed encoding, try to use it */
+  std::string convertedXml;
+  /* use 'm_XmlDeclarationMaxLength * 4' below for UTF-32-like encodings */
+  if (!g_charsetConverter.ToUtf8(guessedEncoding, std::string(xmlContent, std::min(contentLength, m_XmlDeclarationMaxLength * 4)), convertedXml)
+      || convertedXml.empty())
+    return false; /* can't convert, guessed encoding is wrong */
+
+  /* text converted, hopefully at least XML declaration is in UTF-8 now */
+  std::string declaredEncoding;
+  /* try to read real encoding from converted XML declaration */
+  if (!GetXmlEncodingFromDeclaration(convertedXml.c_str(), convertedXml.length(), declaredEncoding))
+  { /* did not find real encoding in XML declaration, use guessed encoding */
+    detectedEncoding = guessedEncoding;
+    return true;
+  }
+
+  /* found encoding in converted XML declaration, we know correct endianness and number of bytes per char */
+  /* make some safety checks */
+  StringUtils::ToUpper(declaredEncoding);
+  if (declaredEncoding == guessedEncoding)
+  {
+    /* declaration confirms the guess; the result must be reported to the caller,
+     * otherwise 'true' would be returned together with an empty encoding name */
+    detectedEncoding = declaredEncoding;
+    return true;
+  }
+
+  if (StringUtils::StartsWith(guessedEncoding, "UCS-4"))
+  {
+    if (declaredEncoding.length() < 5 ||
+        (!StringUtils::StartsWith(declaredEncoding, "UTF-32") && !StringUtils::StartsWith(declaredEncoding, "UCS-4")))
+    { /* Guessed encoding was correct because we can convert and read XML declaration, but declaration itself is wrong (not 4-bytes encoding) */
+      detectedEncoding = guessedEncoding;
+      return true;
+    }
+  }
+  else if (StringUtils::StartsWith(guessedEncoding, "UTF-16"))
+  {
+    if (declaredEncoding.length() < 5 ||
+        (!StringUtils::StartsWith(declaredEncoding, "UTF-16") && !StringUtils::StartsWith(declaredEncoding, "UCS-2")))
+    { /* Guessed encoding was correct because we can read XML declaration, but declaration is wrong (not 2-bytes encoding) */
+      detectedEncoding = guessedEncoding;
+      return true;
+    }
+  }
+
+  if (StringUtils::StartsWith(guessedEncoding, "UCS-4") || StringUtils::StartsWith(guessedEncoding, "UTF-16"))
+  {
+    /* Check endianness in declared encoding. We already know correct endianness as XML declaration was detected after conversion. */
+    /* Guessed UTF/UCS encoding always ends with endianness */
+    std::string guessedEndianness(guessedEncoding, guessedEncoding.length() - 2);
+
+    if (!StringUtils::EndsWith(declaredEncoding, "BE") && !StringUtils::EndsWith(declaredEncoding, "LE")) /* Declared encoding without endianness */
+      detectedEncoding = declaredEncoding + guessedEndianness; /* add guessed endianness */
+    else if (!StringUtils::EndsWith(declaredEncoding, guessedEndianness)) /* Wrong endianness in declared encoding */
+      detectedEncoding = declaredEncoding.substr(0, declaredEncoding.length() - 2) + guessedEndianness; /* replace endianness by guessed endianness */
+    else
+      detectedEncoding = declaredEncoding; /* declared encoding with correct endianness */
+
+    return true;
+  }
+  else if (StringUtils::StartsWith(guessedEncoding, "EBCDIC"))
+  {
+    if (declaredEncoding.find("EBCDIC") != std::string::npos)
+      detectedEncoding = declaredEncoding; /* Declared encoding is some specific EBCDIC encoding */
+    else
+      detectedEncoding = guessedEncoding;
+
+    return true;
+  }
+
+  /* should be unreachable */
+  return false;
+}
+
+/**
+ * Read the value of the "encoding" pseudo-attribute from an XML declaration ("<?xml ... ?>").
+ * Only the first 'm_XmlDeclarationMaxLength' bytes of the content are searched for "<?xml",
+ * and "<?xml" must start at the first '<' in the content.
+ * The content is scanned byte-by-byte without any charset conversion.
+ * @param xmlContent       pointer to the XML text to analyze
+ * @param contentLength    length of the XML text in bytes
+ * @param declaredEncoding receives the encoding name exactly as written between the quotes
+ *                         (case is not changed); cleared at function start
+ * @return true if a quoted "encoding" value was found, false otherwise
+ */
+bool CCharsetDetection::GetXmlEncodingFromDeclaration(const char* const xmlContent, const size_t contentLength, std::string& declaredEncoding)
+{
+ // following code is std::string-processing analog of regular expression-processing
+ // regular expression: "<\\?xml([ \n\r\t]+[^ \n\t\r>]+)*[ \n\r\t]+encoding[ \n\r\t]*=[ \n\r\t]*('[^ \n\t\r>']+'|\"[^ \n\t\r>\"]+\")"
+ // on win32 x86 machine regular expression is slower that std::string 20-40 times and can slowdown XML processing for several times
+ // seems that this regular expression is too slow due to many variable length parts, regexp for '&'-fixing is much faster
+
+ declaredEncoding.clear();
+
+ // avoid extra large search
+ std::string strXml(xmlContent, std::min(contentLength, m_XmlDeclarationMaxLength));
+
+ size_t pos = strXml.find("<?xml");
+ if (pos == std::string::npos || pos + 6 > strXml.length() || pos > strXml.find('<'))
+ return false; // no "<?xml" declaration, "<?xml" is not first element or "<?xml" is incomplete
+
+ pos += 5; // 5 is length of "<?xml"
+
+ // declaration body ends at the first '>' after "<?xml"; if there is no '>' in 'strXml' then 'find' returns npos,
+ // 'npos - pos' is a very large value and one of the other two limits is used instead
+ const size_t declLength = std::min(std::min(m_XmlDeclarationMaxLength, contentLength - pos), strXml.find('>', pos) - pos);
+ const std::string xmlDecl(xmlContent + pos, declLength);
+ const char* const xmlDeclC = xmlDecl.c_str(); // for faster processing of [] and for null-termination
+
+ static const char* const whiteSpaceChars = " \n\r\t"; // according to W3C Recommendation for XML, any of them can be used as separator
+ pos = 0; // from here 'pos' is an offset inside 'xmlDecl', not inside 'xmlContent'
+
+ // each iteration examines one whitespace-separated token of the declaration
+ while (pos + 12 <= declLength) // 12 is minimal length of "encoding='x'"
+ {
+ pos = xmlDecl.find_first_of(whiteSpaceChars, pos);
+ if (pos == std::string::npos)
+ return false; // no " encoding=" in declaration
+
+ pos = xmlDecl.find_first_not_of(whiteSpaceChars, pos);
+ if (pos == std::string::npos)
+ return false; // no "encoding=" in declaration
+
+ if (xmlDecl.compare(pos, 8, "encoding", 8) != 0)
+ continue; // not "encoding" parameter
+ pos += 8; // length of "encoding"
+
+ // optional whitespace between "encoding" and '='
+ if (xmlDeclC[pos] == ' ' || xmlDeclC[pos] == '\n' || xmlDeclC[pos] == '\r' || xmlDeclC[pos] == '\t') // no buffer overrun as string is null-terminated
+ {
+ pos = xmlDecl.find_first_not_of(whiteSpaceChars, pos);
+ if (pos == std::string::npos)
+ return false; // this " encoding" is incomplete, only whitespace chars remains
+ }
+ if (xmlDeclC[pos] != '=')
+ { // "encoding" without "=", try to find other
+ pos--; // step back to whitespace
+ continue;
+ }
+
+ pos++; // skip '='
+ // optional whitespace between '=' and the opening quote
+ if (xmlDeclC[pos] == ' ' || xmlDeclC[pos] == '\n' || xmlDeclC[pos] == '\r' || xmlDeclC[pos] == '\t') // no buffer overrun as string is null-terminated
+ {
+ pos = xmlDecl.find_first_not_of(whiteSpaceChars, pos);
+ if (pos == std::string::npos)
+ return false; // this " encoding" is incomplete, only whitespace chars remains
+ }
+ // the closing quote must be of the same kind as the opening one; '++pos' moves to the first char of the name
+ size_t encNameEndPos;
+ if (xmlDeclC[pos] == '"')
+ encNameEndPos = xmlDecl.find('"', ++pos);
+ else if (xmlDeclC[pos] == '\'')
+ encNameEndPos = xmlDecl.find('\'', ++pos);
+ else
+ continue; // no quote or double quote after 'encoding=', try to find other
+
+ if (encNameEndPos != std::string::npos)
+ {
+ // NOTE(review): an empty name (encoding="") is also accepted here and reported as success
+ declaredEncoding.assign(xmlDecl, pos, encNameEndPos - pos);
+ return true;
+ }
+ // no closing quote or double quote after 'encoding="x', try to find other
+ }
+
+ return false;
+}
+
+/**
+ * Guess the encoding family of an XML document from its first four bytes, for encodings in which
+ * the start of the document ('<', "<?" or "<?xm") is not readable as single-byte ASCII text.
+ * @param xmlContent       pointer to the XML text to analyze
+ * @param contentLength    length of the XML text in bytes
+ * @param supposedEncoding receives the guessed encoding, empty if nothing was guessed
+ * @return true if an encoding was guessed, false otherwise
+ */
+bool CCharsetDetection::GuessXmlEncoding(const char* const xmlContent, const size_t contentLength, std::string& supposedEncoding)
+{
+  struct StartSignature
+  {
+    unsigned char bytes[4];
+    const char*   encoding;
+  };
+  static const StartSignature knownStarts[] =
+  {
+    { { 0x00, 0x00, 0x00, 0x3C }, "UCS-4BE" },      // '<' in UCS-4 (UTF-32) big-endian; use UCS-4 according to W3C recommendation
+    { { 0x3C, 0x00, 0x00, 0x00 }, "UCS-4LE" },      // '<' in UCS-4 (UTF-32) little-endian; use UCS-4 according to W3C recommendation
+    { { 0x00, 0x3C, 0x00, 0x3F }, "UTF-16BE" },     // "<?" in UTF-16 (UCS-2) big-endian
+    { { 0x3C, 0x00, 0x3F, 0x00 }, "UTF-16LE" },     // "<?" in UTF-16 (UCS-2) little-endian
+    { { 0x4C, 0x6F, 0xA7, 0x94 }, "EBCDIC-CP-US" }  // "<?xm" in most EBCDIC encodings; guessed value, real value must be read from declaration
+  };
+
+  supposedEncoding.clear();
+  if (contentLength < 4)
+    return false; // too little data to guess
+
+  const size_t knownStartsCount = sizeof(knownStarts) / sizeof(knownStarts[0]);
+  for (size_t idx = 0; idx < knownStartsCount; ++idx)
+  {
+    const StartSignature& candidate = knownStarts[idx];
+
+    bool allBytesMatch = true;
+    for (size_t byteNum = 0; byteNum < 4 && allBytesMatch; ++byteNum)
+      allBytesMatch = (static_cast<unsigned char>(xmlContent[byteNum]) == candidate.bytes[byteNum]);
+
+    if (allBytesMatch)
+    {
+      supposedEncoding = candidate.encoding;
+      return true;
+    }
+  }
+
+  return false;
+}
+
--- /dev/null
+#pragma once
+
+/*
+* Copyright (C) 2013 Team XBMC
+* http://xbmc.org
+*
+* This Program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2, or (at your option)
+* any later version.
+*
+* This Program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with XBMC; see the file COPYING. If not, see
+* <http://www.gnu.org/licenses/>.
+*
+*/
+
+#include <string>
+
+
+/**
+ * Collection of static helpers for detecting the character encoding of text and XML data.
+ */
+class CCharsetDetection
+{
+public:
+  /**
+   * Detect text encoding by Byte Order Mark
+   * Multibyte encodings (UTF-16/32) always end with explicit endianness (LE/BE)
+   * @param content pointer to text to analyze
+   * @param contentLength length of text
+   * @return detected encoding or empty string if BOM not detected
+   */
+  static std::string GetBomEncoding(const char* const content, const size_t contentLength);
+  /**
+   * Detect text encoding by Byte Order Mark
+   * Multibyte encodings (UTF-16/32) always end with explicit endianness (LE/BE)
+   * @param content the text to analyze
+   * @return detected encoding or empty string if BOM not detected
+   */
+  static inline std::string GetBomEncoding(const std::string& content)
+  { return GetBomEncoding(content.c_str(), content.length()); }
+  /**
+   * Detect text encoding by Byte Order Mark
+   * Overload kept for source compatibility; prefer the single-argument overload.
+   * @param content the text to analyze
+   * @param detectedEncoding unused: it is NOT modified, the result is only returned
+   * @return detected encoding or empty string if BOM not detected
+   */
+  static inline std::string GetBomEncoding(const std::string& content, std::string& /*detectedEncoding*/)
+  { return GetBomEncoding(content.c_str(), content.length()); }
+
+  /**
+   * Detect encoding of XML data (by Byte Order Mark, XML declaration or guessing)
+   * @param xmlContent the XML text to analyze
+   * @param detectedEncoding reference to variable that receives the detected encoding
+   * @return true if encoding was detected, false otherwise
+   */
+  static inline bool DetectXmlEncoding(const std::string& xmlContent, std::string& detectedEncoding)
+  { return DetectXmlEncoding(xmlContent.c_str(), xmlContent.length(), detectedEncoding); }
+
+  /**
+   * Detect encoding of XML data (by Byte Order Mark, XML declaration or guessing)
+   * @param xmlContent pointer to XML text to analyze
+   * @param contentLength length of XML text
+   * @param detectedEncoding reference to variable that receives the detected encoding
+   * @return true if encoding was detected, false otherwise
+   */
+  static bool DetectXmlEncoding(const char* const xmlContent, const size_t contentLength, std::string& detectedEncoding);
+
+private:
+  /**
+   * Read value of "encoding" parameter from XML declaration ("<?xml ... ?>")
+   * @param xmlContent pointer to XML text to analyze
+   * @param contentLength length of XML text
+   * @param declaredEncoding reference to variable that receives the declared encoding
+   * @return true if "encoding" parameter was found, false otherwise
+   */
+  static bool GetXmlEncodingFromDeclaration(const char* const xmlContent, const size_t contentLength, std::string& declaredEncoding);
+  /**
+   * Try to guess text encoding by searching for '<?xml' mark in different encodings
+   * Multibyte encodings (UTF/UCS) always end with explicit endianness (LE/BE)
+   * @param xmlContent pointer to text to analyze
+   * @param contentLength length of text
+   * @param supposedEncoding reference to variable that receives the supposed encoding
+   * @return true if any encoding supposed, false otherwise
+   */
+  static bool GuessXmlEncoding(const char* const xmlContent, const size_t contentLength, std::string& supposedEncoding);
+
+  /** maximum number of bytes searched for an XML declaration */
+  static const size_t m_XmlDeclarationMaxLength;
+};
SRCS += BitstreamStats.cpp
SRCS += BooleanLogic.cpp
SRCS += CharsetConverter.cpp
+SRCS += CharsetDetection.cpp
SRCS += CPUInfo.cpp
SRCS += Crc32.cpp
SRCS += CryptThreading.cpp
iStart = strXML.Find("<content:encoded>");
}
+ // TODO: Use server reported charset
if (Parse((LPSTR)strXML.c_str(), iFeed))
CLog::Log(LOGDEBUG, "Parsed rss feed: %s", strUrl.c_str());
}
bool CRssReader::Parse(LPSTR szBuffer, int iFeed)
{
m_xml.Clear();
- m_xml.Parse((LPCSTR)szBuffer, TIXML_ENCODING_LEGACY);
+ m_xml.Parse((LPCSTR)szBuffer);
+ m_encoding = "UTF-8"; // TODO: remove member variable
- m_encoding = "UTF-8";
- if (m_xml.RootElement())
- {
- TiXmlDeclaration *tiXmlDeclaration = m_xml.RootElement()->Parent()->FirstChild()->ToDeclaration();
- if (tiXmlDeclaration != NULL && strlen(tiXmlDeclaration->Encoding()) > 0)
- m_encoding = tiXmlDeclaration->Encoding();
- }
-
- CLog::Log(LOGDEBUG, "RSS feed encoding: %s", m_encoding.c_str());
+ CLog::Log(LOGDEBUG, "RSS feed encoding: %s", m_xml.GetUsedCharset().c_str());
return Parse(iFeed);
}
return false;
// ok, now parse the xml file
- if (!XMLUtils::HasUTF8Declaration(strUrl))
- g_charsetConverter.unknownToUTF8(strUrl);
-
CXBMCTinyXML doc;
- doc.Parse(strUrl, TIXML_ENCODING_UTF8);
+ doc.Parse(strUrl, TIXML_ENCODING_UNKNOWN);
TiXmlElement* pElement = doc.RootElement();
if (!pElement)
return false;
// ok, now parse the xml file
- if (!XMLUtils::HasUTF8Declaration(strUrls))
- g_charsetConverter.unknownToUTF8(strUrls);
-
CXBMCTinyXML doc;
- doc.Parse(strUrls, TIXML_ENCODING_UTF8);
+ doc.Parse(strUrls, TIXML_ENCODING_UNKNOWN);
if (doc.RootElement())
{
TiXmlHandle docHandle( &doc );
return;
}
- CStdString strEncoding;
- XMLUtils::GetEncoding(&xmlDoc, strEncoding);
-
TiXmlElement* pRootElement = xmlDoc.RootElement();
if (pRootElement->Value() != CStdString("strings"))
return;
(LOCALIZED_TOKEN_FIRSTID3 <= id && id <= LOCALIZED_TOKEN_LASTID3) ||
(LOCALIZED_TOKEN_FIRSTID4 <= id && id <= LOCALIZED_TOKEN_LASTID4))
{
- CStdString utf8Label;
- if (strEncoding.IsEmpty()) // Is language file utf8?
- utf8Label=pChild->FirstChild()->Value();
- else
- g_charsetConverter.ToUtf8(strEncoding, pChild->FirstChild()->Value(), utf8Label);
-
+ CStdString utf8Label(pChild->FirstChild()->Value());
if (!utf8Label.IsEmpty())
m_localizedTokens.insert(make_pair(utf8Label, id));
}
#include "XBMCTinyXML.h"
#include "filesystem/File.h"
#include "utils/FileUtils.h"
+#include "utils/StringUtils.h"
+#include "utils/CharsetConverter.h"
+#include "utils/CharsetDetection.h"
+#include "LangInfo.h"
#include "RegExp.h"
+#include "utils/log.h"
#define MAX_ENTITY_LENGTH 8 // size of largest entity "&#xNNNN;"
#define BUFFER_SIZE 4096
{
}
+/* Create a named document together with a charset hint for later parsing.
+ * The hint is kept in upper case. */
+CXBMCTinyXML::CXBMCTinyXML(const std::string& documentName, const std::string& documentCharset)
+  : TiXmlDocument(documentName)
+  , m_SuggestedCharset(documentCharset)
+{
+  // normalize case once here, charset names are compared against upper-case literals when parsing
+  StringUtils::ToUpper(m_SuggestedCharset);
+}
+
bool CXBMCTinyXML::LoadFile(TiXmlEncoding encoding)
{
return LoadFile(value, encoding);
return true;
}
+/* Load a file using 'documentCharset' as the charset hint.
+ * The hint is stored in upper case and the file is then loaded with
+ * TIXML_ENCODING_UNKNOWN. */
+bool CXBMCTinyXML::LoadFile(const std::string& _filename, const std::string& documentCharset)
+{
+  std::string upperCasedHint(documentCharset);
+  StringUtils::ToUpper(upperCasedHint);
+  m_SuggestedCharset.swap(upperCasedHint);
+
+  return LoadFile(_filename, TIXML_ENCODING_UNKNOWN);
+}
+
bool CXBMCTinyXML::LoadFile(FILE *f, TiXmlEncoding encoding)
{
std::string data;
return false;
}
-const char *CXBMCTinyXML::Parse(const char *_data, TiXmlEncoding encoding)
+bool CXBMCTinyXML::Parse(const char *_data, TiXmlEncoding encoding)
{
return Parse(std::string(_data), encoding);
}
-const char *CXBMCTinyXML::Parse(const std::string& rawdata, TiXmlEncoding encoding)
+/* Parse in-memory XML using 'dataCharset' as the charset hint.
+ * The hint is stored in upper case and the data is then parsed with
+ * TIXML_ENCODING_UNKNOWN. */
+bool CXBMCTinyXML::Parse(const std::string& data, const std::string& dataCharset)
+{
+  std::string upperCasedHint(dataCharset);
+  StringUtils::ToUpper(upperCasedHint);
+  m_SuggestedCharset.swap(upperCasedHint);
+
+  return Parse(data, TIXML_ENCODING_UNKNOWN);
+}
+
+/* Parse in-memory XML.
+ * With an explicit TinyXML encoding the data is parsed directly. With
+ * TIXML_ENCODING_UNKNOWN several charsets are tried in turn until one parses
+ * without error: the suggested charset, the charset detected from the data,
+ * plain UTF-8 (if the data is valid UTF-8 and it was not tried yet) and the GUI charset.
+ * If all of them fail, the data is parsed as is.
+ * Returns true on success. */
+bool CXBMCTinyXML::Parse(const std::string& data, TiXmlEncoding encoding /*= TIXML_DEFAULT_ENCODING */)
+{
+  m_UsedCharset.clear();
+
+  // explicit encoding requested by caller: no charset probing
+  if (encoding != TIXML_ENCODING_UNKNOWN)
+    return InternalParse(data, encoding);
+
+  // step 1: charset suggested by caller
+  const bool haveSuggestion = !m_SuggestedCharset.empty();
+  if (haveSuggestion && TryParse(data, m_SuggestedCharset))
+    return true;
+
+  // step 2: charset detected from the data itself
+  std::string detectedCharset;
+  const bool haveDetection = CCharsetDetection::DetectXmlEncoding(data, detectedCharset);
+  if (haveDetection && TryParse(data, detectedCharset))
+    return true;
+
+  // step 3: check for valid UTF-8, unless UTF-8 was already tried in previous steps
+  const bool utf8AlreadyTried = (m_SuggestedCharset == "UTF-8" || detectedCharset == "UTF-8");
+  if (!utf8AlreadyTried && g_charsetConverter.isValidUtf8(data) && TryParse(data, "UTF-8"))
+    return true;
+
+  // step 4, fallback: try user GUI charset
+  if (TryParse(data, g_langInfo.GetGuiCharSet()))
+    return true;
+
+  // can't detect correct data charset, try to process data as is
+  return InternalParse(data, TIXML_ENCODING_UNKNOWN);
+}
+
+/* Try to parse 'data' assuming it is encoded in 'tryDataCharset'.
+ * An empty charset means legacy parsing, "UTF-8" means no conversion, anything else is
+ * converted to UTF-8 first. On a parse error the document is cleared and false is returned.
+ * On success the charset is remembered as the used charset, and a warning is logged
+ * when it differs from a non-empty suggested charset. */
+bool CXBMCTinyXML::TryParse(const std::string& data, const std::string& tryDataCharset)
+{
+  if (tryDataCharset.empty())
+    InternalParse(data, TIXML_ENCODING_LEGACY);
+  else if (tryDataCharset == "UTF-8")
+    InternalParse(data, TIXML_ENCODING_UTF8); // process data without conversion
+  else
+  {
+    std::string utf8Data;
+    const bool isConverted = g_charsetConverter.ToUtf8(tryDataCharset, data, utf8Data);
+    if (!isConverted || utf8Data.empty())
+      return false; // can't convert data
+
+    InternalParse(utf8Data, TIXML_ENCODING_UTF8);
+  }
+
+  // return values above are ignored on purpose:
+  // 'Error()' contains result of last run of 'TiXmlDocument::Parse()'
+  if (Error())
+  {
+    Clear();
+    location.Clear();
+
+    return false;
+  }
+
+  m_UsedCharset = tryDataCharset;
+
+  const bool differsFromSuggested = !m_SuggestedCharset.empty() && m_UsedCharset != m_SuggestedCharset;
+  if (differsFromSuggested)
+  {
+    CLog::Log(LOGWARNING, "%s: Using \"%s\" charset instead of \"%s\" charset%s", __FUNCTION__, m_UsedCharset.c_str(), m_SuggestedCharset.c_str(),
+              (value.empty() ? "" : (" for file " + value).c_str()));
+  }
+
+  return true;
+}
+
+bool CXBMCTinyXML::InternalParse(const std::string& rawdata, TiXmlEncoding encoding /*= TIXML_DEFAULT_ENCODING */)
{
// Preprocess string, replacing '&' with '& for invalid XML entities
size_t pos = rawdata.find('&');
if (pos == std::string::npos)
- return TiXmlDocument::Parse(rawdata.c_str(), NULL, encoding); // nothing to fix, process data directly
+ return (TiXmlDocument::Parse(rawdata.c_str(), NULL, encoding) != NULL); // nothing to fix, process data directly
std::string data(rawdata);
CRegExp re(false, false, "^&(amp|lt|gt|quot|apos|#x[a-fA-F0-9]{1,4}|#[0-9]{1,5});.*");
pos = data.find('&', pos + 1);
} while (pos != std::string::npos);
- return TiXmlDocument::Parse(data.c_str(), NULL, encoding);
+ return (TiXmlDocument::Parse(data.c_str(), NULL, encoding) != NULL);
}
bool CXBMCTinyXML::Test()
CXBMCTinyXML();
CXBMCTinyXML(const char*);
CXBMCTinyXML(const std::string& documentName);
+ CXBMCTinyXML(const std::string& documentName, const std::string& documentCharset);
bool LoadFile(TiXmlEncoding encoding = TIXML_DEFAULT_ENCODING);
bool LoadFile(const char*, TiXmlEncoding encoding = TIXML_DEFAULT_ENCODING);
bool LoadFile(const std::string& _filename, TiXmlEncoding encoding = TIXML_DEFAULT_ENCODING);
+ bool LoadFile(const std::string& _filename, const std::string& documentCharset);
bool LoadFile(FILE*, TiXmlEncoding encoding = TIXML_DEFAULT_ENCODING);
bool SaveFile(const char*) const;
bool SaveFile(const std::string& filename) const;
- const char *Parse(const char*, TiXmlEncoding encoding = TIXML_DEFAULT_ENCODING);
- const char *Parse(const std::string& rawdata, TiXmlEncoding encoding = TIXML_DEFAULT_ENCODING);
+ bool Parse(const char*, TiXmlEncoding encoding = TIXML_DEFAULT_ENCODING);
+ bool Parse(const std::string& data, TiXmlEncoding encoding = TIXML_DEFAULT_ENCODING);
+ bool Parse(const std::string& data, const std::string& dataCharset);
+ inline std::string GetSuggestedCharset(void) const { return m_SuggestedCharset; }
+ inline std::string GetUsedCharset(void) const { return m_UsedCharset; }
static bool Test();
+protected:
+ bool TryParse(const std::string& data, const std::string& tryDataCharset);
+ bool InternalParse(const std::string& rawdata, TiXmlEncoding encoding = TIXML_DEFAULT_ENCODING);
+
+ std::string m_SuggestedCharset;
+ std::string m_UsedCharset;
};
return bResult;
}
-/*!
- Returns true if the encoding of the document is other then UTF-8.
- /param strEncoding Returns the encoding of the document. Empty if UTF-8
-*/
-bool XMLUtils::GetEncoding(const CXBMCTinyXML* pDoc, CStdString& strEncoding)
-{
- const TiXmlNode* pNode=NULL;
- while ((pNode=pDoc->IterateChildren(pNode)) && pNode->Type()!=TiXmlNode::TINYXML_DECLARATION) {}
- if (!pNode) return false;
- const TiXmlDeclaration* pDecl=pNode->ToDeclaration();
- if (!pDecl) return false;
- strEncoding=pDecl->Encoding();
- if (strEncoding.Equals("UTF-8") || strEncoding.Equals("UTF8")) strEncoding.Empty();
- strEncoding.MakeUpper();
- return !strEncoding.IsEmpty(); // Other encoding then UTF8?
-}
-
-/*!
- Returns true if the encoding of the document is specified as as UTF-8
- /param strXML The XML file (embedded in a string) to check.
-*/
-bool XMLUtils::HasUTF8Declaration(const CStdString &strXML)
-{
- CStdString test = strXML;
- test.ToLower();
- // test for the encoding="utf-8" string
- if (test.Find("encoding=\"utf-8\"") >= 0)
- return true;
- // TODO: test for plain UTF8 here?
- return false;
-}
-
bool XMLUtils::GetPath(const TiXmlNode* pRootNode, const char* strTag, CStdString& strStringValue)
{
const TiXmlElement* pElement = pRootNode->FirstChildElement(strTag);
class XMLUtils
{
public:
- static bool HasUTF8Declaration(const CStdString &strXML);
static bool HasChild(const TiXmlNode* pRootNode, const char* strTag);
static bool GetHex(const TiXmlNode* pRootNode, const char* strTag, uint32_t& dwHexValue);
*/
static bool GetAdditiveString(const TiXmlNode* rootNode, const char* tag, const CStdString& separator, CStdString& value, bool clear = false);
static bool GetStringArray(const TiXmlNode* rootNode, const char* tag, std::vector<std::string>& arrayValue, bool clear = false, const std::string& separator = "");
- static bool GetEncoding(const CXBMCTinyXML* pDoc, CStdString& strEncoding);
static bool GetPath(const TiXmlNode* pRootNode, const char* strTag, CStdString& strStringValue);
static bool GetFloat(const TiXmlNode* pRootNode, const char* strTag, float& value, const float min, const float max);
static bool GetUInt(const TiXmlNode* pRootNode, const char* strTag, uint32_t& dwUIntValue, const uint32_t min, const uint32_t max);