Merge pull request #3650 from Karlson2k/fix_pcre_utf8

author jmarshallnz <jcmarsha@gmail.com>

Tue, 10 Dec 2013 00:17:31 +0000 (16:17 -0800)

committer jmarshallnz <jcmarsha@gmail.com>

Tue, 10 Dec 2013 00:17:31 +0000 (16:17 -0800)
author jmarshallnz <jcmarsha@gmail.com>
Tue, 10 Dec 2013 00:17:31 +0000 (16:17 -0800)
committer jmarshallnz <jcmarsha@gmail.com>
Tue, 10 Dec 2013 00:17:31 +0000 (16:17 -0800)
diff --git a/xbmc/FileItem.cpp b/xbmc/FileItem.cpp

index 6a80645..73a7949 100644 (file)
--- a/xbmc/FileItem.cpp
+++ b/xbmc/FileItem.cpp
@@ -2324,7 +2324,7 @@ void CFileItemList::StackFolders()
  {
    // Precompile our REs
    VECCREGEXP folderRegExps;
-  CRegExp folderRegExp(true, true);
+  CRegExp folderRegExp(true, CRegExp::autoUtf8);
    const CStdStringArray& strFolderRegExps = g_advancedSettings.m_folderStackRegExps;
  
    CStdStringArray::const_iterator strExpression = strFolderRegExps.begin();
@@ -2416,7 +2416,7 @@ void CFileItemList::StackFiles()
  {
    // Precompile our REs
    VECCREGEXP stackRegExps;
-  CRegExp tmpRegExp(true, true);
+  CRegExp tmpRegExp(true, CRegExp::autoUtf8);
    const CStdStringArray& strStackRegExps = g_advancedSettings.m_videoStackRegExps;
    CStdStringArray::const_iterator strRegExp = strStackRegExps.begin();
    while (strRegExp != strStackRegExps.end())
@@ -3242,7 +3242,7 @@ CStdString CFileItem::FindTrailer() const
  
    // Precompile our REs
    VECCREGEXP matchRegExps;
-  CRegExp tmpRegExp(true, true);
+  CRegExp tmpRegExp(true, CRegExp::autoUtf8);
    const CStdStringArray& strMatchRegExps = g_advancedSettings.m_trailerMatchRegExps;
  
    CStdStringArray::const_iterator strRegExp = strMatchRegExps.begin();
diff --git a/xbmc/Util.cpp b/xbmc/Util.cpp

index ae6eaa6..68d0188 100644 (file)
--- a/xbmc/Util.cpp
+++ b/xbmc/Util.cpp
@@ -243,8 +243,8 @@ void CUtil::CleanString(const CStdString& strFileName, CStdString& strTitle, CSt
  
    const CStdStringArray &regexps = g_advancedSettings.m_videoCleanStringRegExps;
  
-  CRegExp reTags(true, true);
-  CRegExp reYear(false, true);
+  CRegExp reTags(true, CRegExp::autoUtf8);
+  CRegExp reYear(false, CRegExp::autoUtf8);
  
    if (!reYear.RegComp(g_advancedSettings.m_videoCleanDateTimeRegExp))
    {
@@ -519,7 +519,7 @@ bool CUtil::ExcludeFileOrFolder(const CStdString& strFileOrFolder, const CStdStr
    if (strFileOrFolder.empty())
      return false;
  
-  CRegExp regExExcludes(true, true);  // case insensitive regex
+  CRegExp regExExcludes(true, CRegExp::autoUtf8);  // case insensitive regex
  
    for (unsigned int i = 0; i < regexps.size(); i++)
    {
diff --git a/xbmc/cores/ExternalPlayer/ExternalPlayer.cpp b/xbmc/cores/ExternalPlayer/ExternalPlayer.cpp

index 7b72a19..4301a23 100644 (file)
--- a/xbmc/cores/ExternalPlayer/ExternalPlayer.cpp
+++ b/xbmc/cores/ExternalPlayer/ExternalPlayer.cpp
@@ -185,7 +185,7 @@ void CExternalPlayer::Process()
        CStdString strMatch = vecSplit[0];
        StringUtils::Replace(strMatch, ",,",",");
        bool bCaseless = vecSplit[3].find('i') != std::string::npos;
-      CRegExp regExp(bCaseless, true);
+      CRegExp regExp(bCaseless, CRegExp::autoUtf8);
  
        if (!regExp.RegComp(strMatch.c_str()))
        { // invalid regexp - complain in logs
diff --git a/xbmc/cores/playercorefactory/PlayerSelectionRule.cpp b/xbmc/cores/playercorefactory/PlayerSelectionRule.cpp

index a0add7c..63a0125 100644 (file)
--- a/xbmc/cores/playercorefactory/PlayerSelectionRule.cpp
+++ b/xbmc/cores/playercorefactory/PlayerSelectionRule.cpp
@@ -118,7 +118,7 @@ void CPlayerSelectionRule::GetPlayers(const CFileItem& item, VECPLAYERCORES &vec
    if (m_tDVDFile >= 0 && (m_tDVDFile > 0) != item.IsDVDFile()) return;
    if (m_tDVDImage >= 0 && (m_tDVDImage > 0) != item.IsDVDImage()) return;
  
-  CRegExp regExp(false, true);
+  CRegExp regExp(false, CRegExp::autoUtf8);
  
    if (m_bStreamDetails)
    {
diff --git a/xbmc/filesystem/StackDirectory.cpp b/xbmc/filesystem/StackDirectory.cpp

index 8b06ce8..3d1bbac 100644 (file)
--- a/xbmc/filesystem/StackDirectory.cpp
+++ b/xbmc/filesystem/StackDirectory.cpp
@@ -59,7 +59,7 @@ namespace XFILE
    {
      // Load up our REs
      VECCREGEXP  RegExps;
-    CRegExp     tempRE(true, true);
+    CRegExp     tempRE(true, CRegExp::autoUtf8);
      const CStdStringArray& strRegExps = g_advancedSettings.m_videoStackRegExps;
      CStdStringArray::const_iterator itRegExp = strRegExps.begin();
      vector<pair<int, CStdString> > badStacks;
diff --git a/xbmc/utils/RegExp.cpp b/xbmc/utils/RegExp.cpp

index 5afa971..ee2f462 100644 (file)
--- a/xbmc/utils/RegExp.cpp
+++ b/xbmc/utils/RegExp.cpp
@@ -53,19 +53,20 @@ int CRegExp::m_UcpSupported  = -1;
  int CRegExp::m_JitSupported  = -1;
  
  
-CRegExp::CRegExp(bool caseless /*= false*/, bool utf8 /*= false*/)
+CRegExp::CRegExp(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/)
  {
    InitValues(caseless, utf8);
  }
  
-void CRegExp::InitValues(bool caseless /*= false*/, bool utf8 /*= false*/)
+void CRegExp::InitValues(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/)
  {
+  m_utf8Mode    = utf8;
    m_re          = NULL;
    m_sd          = NULL;
    m_iOptions    = PCRE_DOTALL | PCRE_NEWLINE_ANY;
    if(caseless)
      m_iOptions |= PCRE_CASELESS;
-  if (utf8)
+  if (m_utf8Mode == forceUtf8)
    {
      if (IsUtf8Supported())
        m_iOptions |= PCRE_UTF8;
@@ -82,17 +83,162 @@ void CRegExp::InitValues(bool caseless /*= false*/, bool utf8 /*= false*/)
    memset(m_iOvector, 0, sizeof(m_iOvector));
  }
  
-CRegExp::CRegExp(bool caseless, bool utf8, const char *re, studyMode study /*= NoStudy*/)
+CRegExp::CRegExp(bool caseless, CRegExp::utf8Mode utf8, const char *re, studyMode study /*= NoStudy*/)
  {
+  if (utf8 == autoUtf8)
+    utf8 = requireUtf8(re) ? forceUtf8 : asciiOnly;
+
    InitValues(caseless, utf8);
    RegComp(re, study);
  }
  
+bool CRegExp::requireUtf8(const std::string& regexp)
+{
+  // returns true if 'regexp' needs UTF-8 mode; first case: the regexp string has UTF-8 multibyte sequences
+  if (CUtf8Utils::checkStrForUtf8(regexp) == CUtf8Utils::utf8string)
+    return true;
+
+  // check for explicit Unicode properties (\p, \P, \X) and for Unicode character codes (greater than 0xFF) in the form \x{hhh..}
+  // note: PCRE changes the meaning of \w, \s, \d (and \W, \S, \D) when Unicode properties are enabled,
+  //       but in auto mode we enable UCP for a US-ASCII regexp only if it contains an explicit \p, \P, \X or a Unicode character code
+  const char* const regexpC = regexp.c_str();
+  const size_t len = regexp.length();
+  size_t pos = 0;
+
+  while (pos < len)
+  {
+    const char chr = regexpC[pos];
+    if (chr == '\\')
+    {
+      const char nextChr = regexpC[pos + 1];
+
+      if (nextChr == 'p' || nextChr == 'P' || nextChr == 'X')
+        return true; // found Unicode properties
+      else if (nextChr == 'Q')
+        pos = regexp.find("\\E", pos + 2); // skip all literals in "\Q...\E"; 'pos' becomes npos if "\E" is missing (checked below)
+      else if (nextChr == 'x' && regexpC[pos + 2] == '{')
+      { // Unicode character with hex code
+        if (readCharXCode(regexp, pos) >= 0x100)
+          return true; // found Unicode character code
+      }
+      else if (nextChr == '\\' || nextChr == '(' || nextChr == ')'
+               || nextChr == '[' || nextChr == ']')
+               pos++; // skip the escaped character so it is not analysed as regexp syntax
+
+    } // end of chr == '\\' handling
+    else if (chr == '(' && regexpC[pos + 1] == '?' && regexpC[pos + 2] == '#') // comment in regexp
+      pos = regexp.find(')', pos); // skip comment; 'pos' becomes npos if the comment is not closed (checked below)
+    else if (chr == '[')
+    {
+      if (isCharClassWithUnicode(regexp, pos))
+        return true;
+    }
+
+    if (pos == std::string::npos) // check results of regexp.find() and isCharClassWithUnicode()
+      return false;
+
+    pos++;
+  }
+
+  // no Unicode properties or Unicode character codes were found
+  return false;
+}
+
+inline int CRegExp::readCharXCode(const std::string& regexp, size_t& pos)
+{
+  // read hex character code in form "\x{hh..}"
+  // 'pos' must point to '\'
+  if (pos >= regexp.length())
+    return -1;
+  const char* const regexpC = regexp.c_str();
+  if (regexpC[pos] != '\\' || regexpC[pos + 1] != 'x' || regexpC[pos + 2] != '{')
+    return -1;
+
+  pos++;
+  const size_t startPos = pos; // 'startPos' points to 'x'
+  const size_t closingBracketPos = regexp.find('}', startPos + 2);
+  if (closingBracketPos == std::string::npos)
+    return 0; // return character zero code, leave 'pos' at 'x'
+
+  pos++; // 'pos' points to '{'
+  int chCode = 0;
+  while (++pos < closingBracketPos)
+  {
+    const int xdigitVal = StringUtils::asciixdigitvalue(regexpC[pos]);
+    if (xdigitVal >= 0)
+      chCode = chCode * 16 + xdigitVal;
+    else
+    { // found non-hexdigit
+      pos = startPos; // reset 'pos' to 'startPos', process "{hh..}" as non-code
+      return 0; // return character zero code
+    }
+  }
+
+  return chCode;
+}
+
+bool CRegExp::isCharClassWithUnicode(const std::string& regexp, size_t& pos)
+{
+  const char* const regexpC = regexp.c_str();
+  const size_t len = regexp.length();
+  if (pos > len || regexpC[pos] != '[')
+    return false;
+
+  // look for Unicode character code "\x{hhh..}" and Unicode properties "\P", "\p" and "\X"
+  // find end (terminating ']') of character class (like "[a-h45]")
+  // detect nested POSIX classes like "[[:lower:]]" and escaped brackets like "[\]]"
+  bool needUnicode = false;
+  while (++pos < len)
+  {
+    if (regexpC[pos] == '[' && regexpC[pos + 1] == ':')
+    { // possible POSIX character class, like "[:alpha:]"
+      const size_t nextClosingBracketPos = regexp.find(']', pos + 2); // don't care about "\]", as it produce error if used inside POSIX char class
+
+      if (nextClosingBracketPos == std::string::npos)
+      { // error in regexp: no closing ']' for character class
+        pos = std::string::npos;
+        return needUnicode;
+      }
+      else if (regexpC[nextClosingBracketPos - 1] == ':')
+        pos = nextClosingBracketPos; // skip POSIX character class
+      // if ":]" is not found, process "[:..." as part of normal character class
+    }
+    else if (regexpC[pos] == ']')
+      return needUnicode; // end of character class
+    else if (regexpC[pos] == '\\')
+    {
+      const char nextChar = regexpC[pos + 1];
+      if (nextChar == ']' || nextChar == '[')
+        pos++; // skip next character
+      else if (nextChar == 'Q')
+      {
+        pos = regexp.find("\\E", pos + 2);
+        if (pos == std::string::npos)
+          return needUnicode; // error in regexp: no closing "\E" after "\Q" in character class
+        else
+          pos++; // skip "\E"
+      }
+      else if (nextChar == 'p' || nextChar == 'P' || nextChar == 'X')
+        needUnicode = true; // don't care about property name as it can contain only ASCII chars
+      else if (nextChar == 'x')
+      {
+        if (readCharXCode(regexp, pos) >= 0x100)
+          needUnicode = true;
+      }
+    }
+  }
+  pos = std::string::npos; // closing square bracket was not found
+
+  return needUnicode;
+}
+
+
  CRegExp::CRegExp(const CRegExp& re)
  {
    m_re = NULL;
    m_sd = NULL;
    m_jitStack = NULL;
+  m_utf8Mode = re.m_utf8Mode;
    m_iOptions = re.m_iOptions;
    *this = re;
  }
@@ -140,10 +286,13 @@ bool CRegExp::RegComp(const char *re, studyMode study /*= NoStudy*/)
    m_iMatchCount      = 0;
    const char *errMsg = NULL;
    int errOffset      = 0;
+  int options        = m_iOptions;
+  if (m_utf8Mode == autoUtf8 && requireUtf8(re))
+    options |= (IsUtf8Supported() ? PCRE_UTF8 : 0) | (AreUnicodePropertiesSupported() ? PCRE_UCP : 0);
  
    Cleanup();
  
-  m_re = pcre_compile(re, m_iOptions, &errMsg, &errOffset, NULL);
+  m_re = pcre_compile(re, options, &errMsg, &errOffset, NULL);
    if (!m_re)
    {
      m_pattern.clear();
diff --git a/xbmc/utils/RegExp.h b/xbmc/utils/RegExp.h

index d231669..de1ce28 100644 (file)
--- a/xbmc/utils/RegExp.h
+++ b/xbmc/utils/RegExp.h
@@ -48,25 +48,32 @@ public:
      StudyRegExp      = 1, // study expression (slower compilation, faster find)
      StudyWithJitComp      // study expression and JIT-compile it, if possible (heavyweight optimization) 
    };
+  enum utf8Mode
+  {
+    autoUtf8  = -1, // analyze regexp for UTF-8 multi-byte chars, for Unicode character codes > 0xFF (\x{hhh..})
+                    // or for explicit Unicode properties (\p, \P and \X); enable UTF-8 mode if any of them is found
+    asciiOnly =  0, // process regexp and strings as single-byte encoded strings
+    forceUtf8 =  1  // always request UTF-8 mode (with Unicode properties)
+  };
  
    static const int m_MaxNumOfBackrefrences = 20;
    /**
     * @param caseless (optional) Matching will be case insensitive if set to true
     *                            or case sensitive if set to false
-   * @param utf8 (optional) If set to true all string will be processed as UTF-8 strings 
+   * @param utf8 (optional) Control UTF-8 processing
     */
-  CRegExp(bool caseless = false, bool utf8 = false);
+  CRegExp(bool caseless = false, utf8Mode utf8 = asciiOnly);
    /**
     * Create new CRegExp object and compile regexp expression in one step
     * @warning Use only with hardcoded regexp when you're sure that regexp is compiled without errors
     * @param caseless    Matching will be case insensitive if set to true 
     *                    or case sensitive if set to false
-   * @param utf8        If set to true all string will be processed as UTF-8 strings
+   * @param utf8        Control UTF-8 processing
     * @param re          The regular expression
     * @param study (optional) Controls study of expression, useful if expression will be used
     *                         several times
     */
-  CRegExp(bool caseless, bool utf8, const char *re, studyMode study = NoStudy);
+  CRegExp(bool caseless, utf8Mode utf8, const char *re, studyMode study = NoStudy);
  
    CRegExp(const CRegExp& re);
    ~CRegExp();
@@ -143,7 +150,10 @@ public:
  
  private:
    int PrivateRegFind(size_t bufferLen, const char *str, unsigned int startoffset = 0, int maxNumberOfCharsToTest = -1);
-  void InitValues(bool caseless = false, bool utf8 = false);
+  void InitValues(bool caseless = false, CRegExp::utf8Mode utf8 = asciiOnly);
+  static bool requireUtf8(const std::string& regexp);
+  static int readCharXCode(const std::string& regexp, size_t& pos);
+  static bool isCharClassWithUnicode(const std::string& regexp, size_t& pos);
  
    void Cleanup();
    inline bool IsValidSubNumber(int iSub) const;
@@ -153,6 +163,7 @@ private:
    static const int OVECCOUNT=(m_MaxNumOfBackrefrences + 1) * 3;
    unsigned int m_offset;
    int         m_iOvector[OVECCOUNT];
+  utf8Mode    m_utf8Mode;
    int         m_iMatchCount;
    int         m_iOptions;
    bool        m_jitCompiled;
diff --git a/xbmc/utils/ScraperParser.cpp b/xbmc/utils/ScraperParser.cpp

index e424fcc..941e9ed 100644 (file)
--- a/xbmc/utils/ScraperParser.cpp
+++ b/xbmc/utils/ScraperParser.cpp
@@ -204,7 +204,19 @@ void CScraperParser::ParseExpression(const CStdString& input, CStdString& dest,
        if (stricmp(sensitive,"yes") == 0)
          bInsensitive=false; // match case sensitive
  
-    CRegExp reg(bInsensitive, true);
+    CRegExp::utf8Mode eUtf8 = CRegExp::autoUtf8;
+    const char* const strUtf8 = pExpression->Attribute("utf8");
+    if (strUtf8)
+    {
+      if (stricmp(strUtf8, "yes") == 0)
+        eUtf8 = CRegExp::forceUtf8;
+      else if (stricmp(strUtf8, "no") == 0)
+        eUtf8 = CRegExp::asciiOnly;
+      else if (stricmp(strUtf8, "auto") == 0)
+        eUtf8 = CRegExp::autoUtf8;
+    }
+
+    CRegExp reg(bInsensitive, eUtf8);
      CStdString strExpression;
      if (pExpression->FirstChild())
        strExpression = pExpression->FirstChild()->Value();
diff --git a/xbmc/utils/StringUtils.cpp b/xbmc/utils/StringUtils.cpp

index ca84fdd..33d298b 100644 (file)
--- a/xbmc/utils/StringUtils.cpp
+++ b/xbmc/utils/StringUtils.cpp
@@ -733,6 +733,28 @@ bool StringUtils::IsInteger(const CStdString& str)
    return i == str.size() && n > 0;
  }
  
+int StringUtils::asciidigitvalue(char chr) // value (0..9) of a US-ASCII decimal digit, or -1 if 'chr' is not one
+{
+  if (!isasciidigit(chr))
+    return -1; // not in '0'..'9'
+
+  return chr - '0'; // distance from '0' is the digit's numeric value
+}
+
+int StringUtils::asciixdigitvalue(char chr) // value (0..15) of a US-ASCII hexadecimal digit, or -1 if 'chr' is not one
+{
+  int v = asciidigitvalue(chr); // handles '0'..'9'
+  if (v >= 0)
+    return v;
+  if (chr >= 'a' && chr <= 'f')
+    return chr - 'a' + 10; // lowercase 'a'..'f' map to 10..15
+  if (chr >= 'A' && chr <= 'F')
+    return chr - 'A' + 10; // uppercase 'A'..'F' map to 10..15
+
+  return -1; // not a hexadecimal digit
+}
+
+
  void StringUtils::RemoveCRLF(CStdString& strLine)
  {
    StringUtils::TrimRight(strLine, "\n\r");
diff --git a/xbmc/utils/StringUtils.h b/xbmc/utils/StringUtils.h

index 4f5d891..54c835a 100644 (file)
--- a/xbmc/utils/StringUtils.h
+++ b/xbmc/utils/StringUtils.h
@@ -125,6 +125,32 @@ public:
     \return true if the string is an integer, false otherwise.
     */
    static bool IsInteger(const CStdString& str);
+
+  /* The following isasciiXX and asciiXXvalue functions are locale independent (US-ASCII only),
+   * as opposed to the standard ::isXX functions (::isalpha, ::isdigit...), which are locale dependent.
+   * They take a plain char, so callers don't need the double cast ((int)(unsigned char)) that the standard functions require. */
+  inline static bool isasciidigit(char chr) // true only for '0'..'9'; locale independent
+  {
+    return chr >= '0' && chr <= '9'; 
+  }
+  inline static bool isasciixdigit(char chr) // true only for '0'..'9', 'a'..'f' and 'A'..'F'; locale independent
+  {
+    return (chr >= '0' && chr <= '9') || (chr >= 'a' && chr <= 'f') || (chr >= 'A' && chr <= 'F'); 
+  }
+  static int asciidigitvalue(char chr); // locale independent 
+  static int asciixdigitvalue(char chr); // locale independent 
+  inline static bool isasciiuppercaseletter(char chr) // true only for 'A'..'Z'; locale independent
+  {
+    return (chr >= 'A' && chr <= 'Z'); 
+  }
+  inline static bool isasciilowercaseletter(char chr) // true only for 'a'..'z'; locale independent
+  {
+    return (chr >= 'a' && chr <= 'z'); 
+  }
+  inline static bool isasciialphanum(char chr) // true only for US-ASCII letters and decimal digits; locale independent
+  {
+    return isasciiuppercaseletter(chr) || isasciilowercaseletter(chr) || isasciidigit(chr); 
+  }
    static CStdString SizeToString(int64_t size);
    static const CStdString EmptyString;
    static const std::string Empty;
diff --git a/xbmc/utils/XBMCTinyXML.cpp b/xbmc/utils/XBMCTinyXML.cpp

index 7101710..e29f8e5 100644 (file)
--- a/xbmc/utils/XBMCTinyXML.cpp
+++ b/xbmc/utils/XBMCTinyXML.cpp
@@ -212,7 +212,7 @@ bool CXBMCTinyXML::InternalParse(const std::string& rawdata, TiXmlEncoding encod
      return (TiXmlDocument::Parse(rawdata.c_str(), NULL, encoding) != NULL); // nothing to fix, process data directly
  
    std::string data(rawdata);
-  CRegExp re(false, false, "^&(amp|lt|gt|quot|apos|#x[a-fA-F0-9]{1,4}|#[0-9]{1,5});.*");
+  CRegExp re(false, CRegExp::asciiOnly, "^&(amp|lt|gt|quot|apos|#x[a-fA-F0-9]{1,4}|#[0-9]{1,5});.*");
    do
    {
      if (re.RegFind(data, pos, MAX_ENTITY_LENGTH) < 0)
diff --git a/xbmc/video/VideoInfoScanner.cpp b/xbmc/video/VideoInfoScanner.cpp

index 4e2780a..62569cb 100644 (file)
--- a/xbmc/video/VideoInfoScanner.cpp
+++ b/xbmc/video/VideoInfoScanner.cpp
@@ -870,7 +870,7 @@ namespace VIDEO
  
      for (unsigned int i=0;i<expression.size();++i)
      {
-      CRegExp reg(true, true);
+      CRegExp reg(true, CRegExp::autoUtf8);
        if (!reg.RegComp(expression[i].regexp))
          continue;
  
@@ -939,7 +939,7 @@ namespace VIDEO
        // add what we found by now
        episodeList.push_back(episode);
  
-      CRegExp reg2(true, true);
+      CRegExp reg2(true, CRegExp::autoUtf8);
        // check the remainder of the string for any further episodes.
        if (!byDate && reg2.RegComp(g_advancedSettings.m_tvshowMultiPartEnumRegExp))
        {
author	jmarshallnz <jcmarsha@gmail.com>
	Tue, 10 Dec 2013 00:17:31 +0000 (16:17 -0800)
committer	jmarshallnz <jcmarsha@gmail.com>
	Tue, 10 Dec 2013 00:17:31 +0000 (16:17 -0800)
xbmc/FileItem.cpp		patch \| blob \| history
xbmc/Util.cpp		patch \| blob \| history
xbmc/cores/ExternalPlayer/ExternalPlayer.cpp		patch \| blob \| history
xbmc/cores/playercorefactory/PlayerSelectionRule.cpp		patch \| blob \| history
xbmc/filesystem/StackDirectory.cpp		patch \| blob \| history
xbmc/utils/RegExp.cpp		patch \| blob \| history
xbmc/utils/RegExp.h		patch \| blob \| history
xbmc/utils/ScraperParser.cpp		patch \| blob \| history
xbmc/utils/StringUtils.cpp		patch \| blob \| history
xbmc/utils/StringUtils.h		patch \| blob \| history
xbmc/utils/XBMCTinyXML.cpp		patch \| blob \| history
xbmc/video/VideoInfoScanner.cpp		patch \| blob \| history