DVDSubtitleStream: improve stream encoding detection and optimize conversion
author: Karlson2k <k2k@narod.ru>
Thu, 21 Nov 2013 21:49:47 +0000 (01:49 +0400)
committer: Karlson2k <k2k@narod.ru>
Sat, 28 Dec 2013 15:12:04 +0000 (19:12 +0400)
Use BOM detection for all supported charsets, not only for UTF-8 and UTF-16LE.
Do not convert twice (subtitle -> wstring -> UTF-8); convert directly (subtitle -> UTF-8) instead.
Additionally, in CharsetConverter: replace subtitleCharsetToW with subtitleCharsetToUtf8.

xbmc/cores/dvdplayer/DVDSubtitles/DVDSubtitleStream.cpp
xbmc/utils/CharsetConverter.cpp
xbmc/utils/CharsetConverter.h

index 30756cf..5218d01 100644 (file)
 #include "DVDInputStreams/DVDInputStream.h"
 #include "utils/CharsetConverter.h"
 #include "utils/Utf8Utils.h"
+#include "utils/CharsetDetection.h"
+#include "filesystem/File.h"
 
 using namespace std;
+using XFILE::auto_buffer;
 
 CDVDSubtitleStream::CDVDSubtitleStream()
 {
@@ -40,60 +43,51 @@ bool CDVDSubtitleStream::Open(const string& strFile)
   pInputStream = CDVDFactoryInputStream::CreateInputStream(NULL, strFile, "");
   if (pInputStream && pInputStream->Open(strFile.c_str(), ""))
   {
-    unsigned char buffer[16384];
-    int size_read = 0;
-    size_read = pInputStream->Read(buffer,3);
-    bool isUTF8 = false;
-    bool isUTF16 = false;
-    if (buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF)
-      isUTF8 = true;
-    else if (buffer[0] == 0xFF && buffer[1] == 0xFE)
+    static const size_t chunksize = 64 * 1024;
+    auto_buffer buf;
+
+    // read content
+    size_t totalread = 0;
+    int read;
+    do
     {
-      isUTF16 = true;
-      pInputStream->Seek(2, SEEK_SET);
-    }
-    else
-      pInputStream->Seek(0, SEEK_SET);
+      if (totalread == buf.size())
+        buf.resize(buf.size() + chunksize);
+
+      read = pInputStream->Read((uint8_t*)buf.get() + totalread, buf.size() - totalread);
+      if (read > 0)
+        totalread += read;
+    } while (read > 0);
+
+    delete pInputStream;
+    if (!totalread)
+      return false;
 
-    if (isUTF16)
+    std::string tmpStr(buf.get(), totalread);
+    buf.clear();
+
+    std::string enc(CCharsetDetection::GetBomEncoding(tmpStr));
+    if (enc == "UTF-8" || (enc.empty() && CUtf8Utils::isValidUtf8(tmpStr)))
+      m_stringstream << tmpStr;
+    else if (!enc.empty())
     {
-      std::wstringstream wstringstream;
-      while( (size_read = pInputStream->Read(buffer, sizeof(buffer)-2) ) > 0 )
-      {
-        buffer[size_read] = buffer[size_read + 1] = '\0';
-        CStdStringW temp; 
-        g_charsetConverter.utf16LEtoW(std::u16string((char16_t*)buffer),temp); 
-        wstringstream << temp; 
-      }
-      delete pInputStream;
-
-      CStdString strUTF8;
-      g_charsetConverter.wToUTF8(CStdStringW(wstringstream.str()),strUTF8);
-      m_stringstream.str("");
-      m_stringstream << strUTF8;
+      std::string converted;
+      g_charsetConverter.ToUtf8(enc, tmpStr, converted);
+      if (converted.empty())
+        return false;
+
+      m_stringstream << converted;
     }
     else
     {
-      while( (size_read = pInputStream->Read(buffer, sizeof(buffer)-1) ) > 0 )
-      {
-        buffer[size_read] = '\0';
-        m_stringstream << buffer;
-      }
-      delete pInputStream;
-
-      if (!isUTF8)
-        isUTF8 = CUtf8Utils::isValidUtf8(m_stringstream.str());
-
-      if (!isUTF8)
-      {
-        CStdStringW strUTF16;
-        CStdString strUTF8;
-        g_charsetConverter.subtitleCharsetToW(m_stringstream.str(), strUTF16);
-        g_charsetConverter.wToUTF8(strUTF16,strUTF8);
-        m_stringstream.str("");
-        m_stringstream << strUTF8;
-      }
+      std::string converted;
+      g_charsetConverter.subtitleCharsetToUtf8(tmpStr, converted);
+      if (converted.empty())
+        return false;
+
+      m_stringstream << converted;
     }
+
     return true;
   }
 
index 5fb0c95..aa75d0a 100644 (file)
@@ -271,7 +271,7 @@ enum StdConversionType /* Keep it in sync with CCharsetConverter::CInnerConverte
   Utf32ToUtf8,
   Utf32ToW,
   WToUtf32,
-  SubtitleCharsetToW,
+  SubtitleCharsetToUtf8,
   Utf8ToUserCharset,
   UserCharsetToUtf8,
   Utf32ToUserCharset,
@@ -315,7 +315,7 @@ CConverterType CCharsetConverter::CInnerConverter::m_stdConversion[NumberOfStdCo
   /* Utf32ToUtf8 */         CConverterType(UTF32_CHARSET,   "UTF-8", CCharsetConverter::m_Utf8CharMaxSize),
   /* Utf32ToW */            CConverterType(UTF32_CHARSET,   WCHAR_CHARSET),
   /* WToUtf32 */            CConverterType(WCHAR_CHARSET,   UTF32_CHARSET),
-  /* SubtitleCharsetToW */  CConverterType(SubtitleCharset, WCHAR_CHARSET),
+  /* SubtitleCharsetToUtf8*/CConverterType(SubtitleCharset, "UTF-8", CCharsetConverter::m_Utf8CharMaxSize),
   /* Utf8ToUserCharset */   CConverterType(UTF8_SOURCE,     UserCharset),
   /* UserCharsetToUtf8 */   CConverterType(UserCharset,     "UTF-8", CCharsetConverter::m_Utf8CharMaxSize),
   /* Utf32ToUserCharset */  CConverterType(UTF32_CHARSET,   UserCharset),
@@ -654,7 +654,7 @@ void CCharsetConverter::resetUserCharset(void)
 
 void CCharsetConverter::resetSubtitleCharset(void)
 {
-  CInnerConverter::m_stdConversion[SubtitleCharsetToW].Reset();
+  CInnerConverter::m_stdConversion[SubtitleCharsetToUtf8].Reset();
 }
 
 void CCharsetConverter::resetKaraokeCharset(void)
@@ -749,9 +749,9 @@ bool CCharsetConverter::utf8ToW(const std::string& utf8StringSrc, std::wstring&
   return CInnerConverter::stdConvert(Utf8toW, utf8StringSrc, wStringDst, failOnBadChar);
 }
 
-bool CCharsetConverter::subtitleCharsetToW(const std::string& stringSrc, std::wstring& wStringDst)
+bool CCharsetConverter::subtitleCharsetToUtf8(const std::string& stringSrc, std::string& utf8StringDst)
 {
-  return CInnerConverter::stdConvert(SubtitleCharsetToW, stringSrc, wStringDst, false);
+  return CInnerConverter::stdConvert(SubtitleCharsetToUtf8, stringSrc, utf8StringDst, false);
 }
 
 bool CCharsetConverter::fromW(const std::wstring& wStringSrc,
index 325daa8..f80411f 100644 (file)
@@ -131,7 +131,7 @@ public:
 
   static bool utf16LEtoW(const std::u16string& utf16String, std::wstring& wString);
 
-  static bool subtitleCharsetToW(const std::string& stringSrc, std::wstring& wStringDst);
+  static bool subtitleCharsetToUtf8(const std::string& stringSrc, std::string& utf8StringDst);
 
   static bool utf8ToStringCharset(const std::string& utf8StringSrc, std::string& stringDst);