2 * Copyright (C) 2005-2013 Team XBMC
5 * This Program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2, or (at your option)
10 * This Program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with XBMC; see the file COPYING. If not, see
17 * <http://www.gnu.org/licenses/>.
25 #include "StdString.h"
27 #include "utils/StringUtils.h"
28 #include "utils/Utf8Utils.h"
// Compatibility shims that smooth over PCRE headers with and without JIT support.
// PCRE_HAS_JIT_CODE is this file's own switch: it is set only when the PCRE
// headers provide PCRE_CONFIG_JIT.
36 #ifdef PCRE_CONFIG_JIT
37 #define PCRE_HAS_JIT_CODE 1
// Fallback for headers without PCRE_STUDY_JIT_COMPILE: 0 contributes no option
// bits when it is passed to pcre_study().
40 #ifndef PCRE_STUDY_JIT_COMPILE
41 #define PCRE_STUDY_JIT_COMPILE 0
// Request code passed to pcre_fullinfo() in RegComp() to ask whether JIT code exists.
// NOTE(review): the guard around this define is not visible in this view; confirm
// it only applies when the PCRE headers do not define PCRE_INFO_JIT themselves.
45 #define PCRE_INFO_JIT 2048
// Without JIT support, releasing study data falls back to plain pcre_free().
47 #ifndef PCRE_HAS_JIT_CODE
48 #define pcre_free_study(x) pcre_free((x))
// Cached answers of the pcre_config() capability probes (static members).
// -1 marks "not probed yet"; the Is*Supported() helpers report support when the value is 1.
51 int CRegExp::m_Utf8Supported = -1;
52 int CRegExp::m_UcpSupported = -1;
53 int CRegExp::m_JitSupported = -1;
// Constructs a regexp object without a pattern.
// caseless - forwarded to InitValues() (case-insensitive matching flag)
// utf8     - forwarded to InitValues() (UTF-8 handling mode)
56 CRegExp::CRegExp(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/)
58 InitValues(caseless, utf8);
// Sets up the PCRE compile options in m_iOptions and resets per-object state.
// caseless - requests case-insensitive matching (PCRE_CASELESS)
// utf8     - UTF-8 handling mode for this object
61 void CRegExp::InitValues(bool caseless /*= false*/, CRegExp::utf8Mode utf8 /*= asciiOnly*/)
// Base options for every pattern: PCRE_DOTALL and PCRE_NEWLINE_ANY.
66 m_iOptions = PCRE_DOTALL | PCRE_NEWLINE_ANY;
// NOTE(review): presumably guarded by the 'caseless' argument; the guard line is not visible in this view.
68 m_iOptions |= PCRE_CASELESS;
// In forced UTF-8 mode, request UTF-8 and Unicode-property support, but only
// for the features the linked PCRE library reports as available.
69 if (m_utf8Mode == forceUtf8)
71 if (IsUtf8Supported())
72 m_iOptions |= PCRE_UTF8;
73 if (AreUnicodePropertiesSupported())
74 m_iOptions |= PCRE_UCP;
// Nothing has been compiled yet, so there is no JIT code.
78 m_jitCompiled = false;
// Clear the match offsets vector.
83 memset(m_iOvector, 0, sizeof(m_iOvector));
// Constructs a regexp object for the pattern 're'.
// caseless - forwarded to InitValues()
// utf8     - UTF-8 handling mode; may be replaced by a mode derived from the pattern (see below)
// re       - pattern text
// study    - study mode for the pattern
// NOTE(review): how 're' and 'study' are consumed is not visible in this view.
86 CRegExp::CRegExp(bool caseless, CRegExp::utf8Mode utf8, const char *re, studyMode study /*= NoStudy*/)
// Derive the mode from the pattern itself: forceUtf8 when requireUtf8() finds
// Unicode constructs, asciiOnly otherwise.
// NOTE(review): presumably only done when the caller asked for automatic mode; the guard is not visible here.
89 utf8 = requireUtf8(re) ? forceUtf8 : asciiOnly;
91 InitValues(caseless, utf8);
// Inspects the pattern text 'regexp' and reports whether it needs UTF-8 / Unicode
// handling. Triggers visible here: Unicode property escapes (\p, \P, \X) and
// "\x{hh..}" codes of 0x100 and above; multibyte UTF-8 text and character classes
// are also checked.
// NOTE(review): the loop header and several return statements are not visible in this view.
95 bool CRegExp::requireUtf8(const std::string& regexp)
97 // enable UTF-8 mode if the regexp string has UTF-8 multibyte sequences
98 if (CUtf8Utils::checkStrForUtf8(regexp) == CUtf8Utils::utf8string)
101 // check for explicit Unicode Properties (\p, \P, \X) and for Unicode character codes (greater than 0xFF) in form \x{hhh..}
102 // note: PCRE changes the meaning of \w, \s, \d (and \W, \S, \D) when Unicode Properties are enabled,
103 // but in auto mode we enable UNP for a US-ASCII regexp only if it contains an explicit \p, \P, \X or Unicode character code
104 const char* const regexpC = regexp.c_str();
105 const size_t len = regexp.length();
// 'chr' is the pattern character currently being inspected.
110 const char chr = regexpC[pos];
// Look-ahead character. c_str() is NUL-terminated, so reading one past the
// last pattern character yields '\0' (assuming pos < len here).
113 const char nextChr = regexpC[pos + 1];
115 if (nextChr == 'p' || nextChr == 'P' || nextChr == 'X')
116 return true; // found Unicode Properties
117 else if (nextChr == 'Q')
118 pos = regexp.find("\\E", pos + 2); // skip all literals in "\Q...\E"
119 else if (nextChr == 'x' && regexpC[pos + 2] == '{')
120 { // Unicode character with hex code
121 if (readCharXCode(regexp, pos) >= 0x100)
122 return true; // found Unicode character code
124 else if (nextChr == '\\' || nextChr == '(' || nextChr == ')'
125 || nextChr == '[' || nextChr == ']')
126 pos++; // exclude next character from analysis
129 else if (chr == '(' && regexpC[pos + 1] == '?' && regexpC[pos + 2] == '#') // comment in regexp
130 pos = regexp.find(')', pos); // skip comment
// Character classes are scanned by a dedicated helper, which also moves 'pos'.
133 if (isCharClassWithUnicode(regexp, pos))
// find() and isCharClassWithUnicode() signal "no terminator found" by setting pos to npos.
137 if (pos == std::string::npos) // check results of regexp.find() and isCharClassWithUnicode
143 // no Unicode Properties were found
// Reads a hexadecimal character code written as "\x{hh..}".
// regexp - pattern text
// pos    - in/out: must point to the '\' that starts the sequence; it is moved
//          while the digits are read and restored to 'startPos' when a
//          non-hex digit is found inside the braces
// Returns 0 when there is no closing '}' or a non-hex digit is found.
// NOTE(review): the success return and the declaration of 'chCode' are not visible in this
// view. The digits are accumulated without a visible upper bound, so a very long digit run
// might overflow; confirm against the full source.
147 inline int CRegExp::readCharXCode(const std::string& regexp, size_t& pos)
149 // read hex character code in form "\x{hh..}"
150 // 'pos' must point to '\'
151 if (pos >= regexp.length())
153 const char* const regexpC = regexp.c_str();
// The || chain short-circuits, so pos + 1 and pos + 2 are only read while the
// preceding characters are not the terminating NUL.
154 if (regexpC[pos] != '\\' || regexpC[pos + 1] != 'x' || regexpC[pos + 2] != '{')
158 const size_t startPos = pos; // 'startPos' points to 'x'
159 const size_t closingBracketPos = regexp.find('}', startPos + 2);
160 if (closingBracketPos == std::string::npos)
161 return 0; // return character zero code, leave 'pos' at 'x'
163 pos++; // 'pos' points to '{'
// Consume the characters between '{' and '}' one by one.
165 while (++pos < closingBracketPos)
167 const int xdigitVal = StringUtils::asciixdigitvalue(regexpC[pos]);
// Append one hexadecimal digit to the code.
169 chCode = chCode * 16 + xdigitVal;
171 { // found non-hexdigit
172 pos = startPos; // reset 'pos' to 'startPos', process "{hh..}" as non-code
173 return 0; // return character zero code
// Scans a character class ("[...]") and reports whether it contains constructs
// that need Unicode support: "\p", "\P", "\X" or "\x{hh..}" codes of 0x100 and above.
// regexp - pattern text
// pos    - in/out: must point to the opening '['; set to std::string::npos when
//          the class (or a nested POSIX class) is not terminated
// NOTE(review): the scanning loop header and some return statements are not visible in this view.
180 bool CRegExp::isCharClassWithUnicode(const std::string& regexp, size_t& pos)
182 const char* const regexpC = regexp.c_str();
183 const size_t len = regexp.length();
// Bail out unless 'pos' really addresses the start of a character class.
184 if (pos > len || regexpC[pos] != '[')
187 // look for Unicode character code "\x{hhh..}" and Unicode properties "\P", "\p" and "\X"
188 // find end (terminating ']') of character class (like "[a-h45]")
189 // detect nested POSIX classes like "[[:lower:]]" and escaped brackets like "[\]]"
190 bool needUnicode = false;
193 if (regexpC[pos] == '[' && regexpC[pos + 1] == ':')
194 { // possible POSIX character class, like "[:alpha:]"
195 const size_t nextClosingBracketPos = regexp.find(']', pos + 2); // don't care about "\]", as it produces an error if used inside a POSIX char class
197 if (nextClosingBracketPos == std::string::npos)
198 { // error in regexp: no closing ']' for character class
199 pos = std::string::npos;
202 else if (regexpC[nextClosingBracketPos - 1] == ':')
203 pos = nextClosingBracketPos; // skip POSIX character class
204 // if ":]" is not found, process "[:..." as part of normal character class
206 else if (regexpC[pos] == ']')
207 return needUnicode; // end of character class
208 else if (regexpC[pos] == '\\')
// Escape sequence: decide by the character that follows the backslash.
210 const char nextChar = regexpC[pos + 1];
211 if (nextChar == ']' || nextChar == '[')
212 pos++; // skip next character
213 else if (nextChar == 'Q')
// Literal run "\Q...\E": jump to its terminator.
215 pos = regexp.find("\\E", pos + 2);
216 if (pos == std::string::npos)
217 return needUnicode; // error in regexp: no closing "\E" after "\Q" in character class
221 else if (nextChar == 'p' || nextChar == 'P' || nextChar == 'X')
222 needUnicode = true; // don't care about property name as it can contain only ASCII chars
223 else if (nextChar == 'x')
// Hex code: values of 0x100 and above cannot be expressed without Unicode support.
225 if (readCharXCode(regexp, pos) >= 0x100)
230 pos = std::string::npos; // closing square bracket was not found
// Copy constructor: takes over the UTF-8 mode and the compile options of 're'.
// NOTE(review): the remaining initialisation is not visible in this view.
236 CRegExp::CRegExp(const CRegExp& re)
241 m_utf8Mode = re.m_utf8Mode;
242 m_iOptions = re.m_iOptions;
// Copy assignment: duplicates the compiled pattern and the last match state of 're'.
// The visible lines do not copy study/JIT data; the copy is flagged as not JIT-compiled.
// NOTE(review): guards and cleanup of the previous state are not visible in this view.
246 const CRegExp& CRegExp::operator=(const CRegExp& re)
250 m_jitCompiled = false;
251 m_pattern = re.m_pattern;
// Ask PCRE for the byte size of the compiled pattern so it can be cloned.
254 if (pcre_fullinfo(re.m_re, NULL, PCRE_INFO_SIZE, &size) >= 0)
// NOTE(review): the clone is allocated with malloc(); confirm the release path matches this allocator.
256 if ((m_re = (pcre*)malloc(size)))
258 memcpy(m_re, re.m_re, size);
// Carry over the result of the last match so the Get*() accessors keep working on the copy.
259 memcpy(m_iOvector, re.m_iOvector, OVECCOUNT*sizeof(int));
260 m_offset = re.m_offset;
261 m_iMatchCount = re.m_iMatchCount;
262 m_bMatched = re.m_bMatched;
263 m_subject = re.m_subject;
264 m_iOptions = re.m_iOptions;
267 CLog::Log(LOGSEVERE, "%s: Failed to allocate memory", __FUNCTION__);
// Compiles the pattern 're' and optionally studies it.
// re    - pattern text
// study - study mode; StudyWithJitComp additionally requests JIT compilation when
//         the library supports it
// Compilation and study problems are written to the log.
// NOTE(review): return statements and several guards are not visible in this view.
278 bool CRegExp::RegComp(const char *re, studyMode study /*= NoStudy*/)
284 m_jitCompiled = false;
287 const char *errMsg = NULL;
// Start from the options prepared by InitValues(); in automatic mode add the
// UTF-8 / Unicode-property flags only when the pattern needs them and the
// library supports them.
289 int options = m_iOptions;
290 if (m_utf8Mode == autoUtf8 && requireUtf8(re))
291 options |= (IsUtf8Supported() ? PCRE_UTF8 : 0) | (AreUnicodePropertiesSupported() ? PCRE_UCP : 0);
295 m_re = pcre_compile(re, options, &errMsg, &errOffset, NULL);
299 CLog::Log(LOGERROR, "PCRE: %s. Compilation failed at offset %d in expression '%s'",
300 errMsg, errOffset, re);
// JIT compilation is requested only when the caller asked for it and the library supports it.
308 const bool jitCompile = (study == StudyWithJitComp) && IsJitSupported();
309 const int studyOptions = jitCompile ? PCRE_STUDY_JIT_COMPILE : 0;
311 m_sd = pcre_study(m_re, studyOptions, &errMsg);
314 CLog::Log(LOGWARNING, "%s: PCRE error \"%s\" while studying expression", __FUNCTION__, errMsg);
317 pcre_free_study(m_sd);
// Record whether the study step actually produced JIT code.
324 m_jitCompiled = (pcre_fullinfo(m_re, m_sd, PCRE_INFO_JIT, &jitPresent) == 0 && jitPresent == 1);
// Searches the NUL-terminated string 'str' for the compiled expression.
// str                    - subject string; may be NULL, in which case the call is
//                          forwarded with length 0 so that PrivateRegFind() can
//                          report the missing subject
// startoffset            - offset in 'str' at which the search starts
// maxNumberOfCharsToTest - limit for the number of characters to test, negative for no limit
// Returns the result of PrivateRegFind().
331 int CRegExp::RegFind(const char *str, unsigned int startoffset /*= 0*/, int maxNumberOfCharsToTest /*= -1*/)
// strlen(NULL) is undefined behaviour, so the length is only measured for a non-NULL string.
333 return PrivateRegFind((str != NULL) ? strlen(str) : 0, str, startoffset, maxNumberOfCharsToTest);
// Runs the compiled expression against a buffer and records the match state.
// bufferLen              - number of valid bytes in 'str'
// str                    - subject buffer
// startoffset            - offset in 'str' at which the search starts; must not exceed bufferLen
// maxNumberOfCharsToTest - when >= 0, limits the tested range to that many bytes after startoffset
// On success returns the match start, expressed as an offset into 'str'.
// Usage errors and PCRE errors are written to the log.
// NOTE(review): the failure return statements are not visible in this view.
336 int CRegExp::PrivateRegFind(size_t bufferLen, const char *str, unsigned int startoffset /* = 0*/, int maxNumberOfCharsToTest /*= -1*/)
344 CLog::Log(LOGERROR, "PCRE: Called before compilation");
350 CLog::Log(LOGERROR, "PCRE: Called without a string to match");
354 if (startoffset > bufferLen)
356 CLog::Log(LOGERROR, "%s: startoffset is beyond end of string to match", __FUNCTION__);
360 #ifdef PCRE_HAS_JIT_CODE
// The JIT stack is created on first use and attached to the study data.
361 if (m_jitCompiled && !m_jitStack)
363 m_jitStack = pcre_jit_stack_alloc(32*1024, 512*1024);
364 if (m_jitStack == NULL)
365 CLog::Log(LOGWARNING, "%s: can't allocate address space for JIT stack", __FUNCTION__);
367 pcre_assign_jit_stack(m_sd, NULL, m_jitStack);
371 if (maxNumberOfCharsToTest >= 0)
372 bufferLen = std::min<size_t>(bufferLen, startoffset + maxNumberOfCharsToTest);
// Matching runs on a private copy that starts at 'startoffset', so the offsets in
// m_iOvector are relative to m_subject.
374 m_subject.assign(str + startoffset, bufferLen - startoffset);
// Pass the study data (m_sd, NULL when the pattern was not studied) so that the
// results of pcre_study() and the JIT code are actually used for matching.
375 int rc = pcre_exec(m_re, m_sd, m_subject.c_str(), m_subject.length(), 0, 0, m_iOvector, OVECCOUNT);
379 static const int fragmentLen = 80; // length of excerpt before erroneous char for log
382 case PCRE_ERROR_NOMATCH:
385 case PCRE_ERROR_MATCHLIMIT:
386 CLog::Log(LOGERROR, "PCRE: Match limit reached");
389 #ifdef PCRE_ERROR_SHORTUTF8
390 case PCRE_ERROR_SHORTUTF8:
// Log the tail of the subject (about fragmentLen bytes, aligned to a valid UTF-8 character).
392 const size_t startPos = (m_subject.length() > fragmentLen) ? CUtf8Utils::RFindValidUtf8Char(m_subject, m_subject.length() - fragmentLen) : 0;
393 if (startPos != std::string::npos)
394 CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character at the end of string. Text before bad character: \"%s\"", m_subject.substr(startPos).c_str());
396 CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character at the end of string");
400 case PCRE_ERROR_BADUTF8:
// The log lines below report m_iOvector[0] as the position of the bad character and
// m_iOvector[1] as the error code.
402 const size_t startPos = (m_iOvector[0] > fragmentLen) ? CUtf8Utils::RFindValidUtf8Char(m_subject, m_iOvector[0] - fragmentLen) : 0;
403 if (m_iOvector[0] >= 0 && startPos != std::string::npos)
404 CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character, error code: %d, position: %d. Text before bad char: \"%s\"", m_iOvector[1], m_iOvector[0], m_subject.substr(startPos, m_iOvector[0] - startPos + 1).c_str());
406 CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character, error code: %d, position: %d", m_iOvector[1], m_iOvector[0]);
409 case PCRE_ERROR_BADUTF8_OFFSET:
410 CLog::Log(LOGERROR, "PCRE: Offset is pointing to the middle of UTF-8 character");
414 CLog::Log(LOGERROR, "PCRE: Unknown error: %d", rc);
// Remember the offset so positions can be translated back to the caller's buffer.
418 m_offset = startoffset;
421 return m_iOvector[0] + m_offset;
// Queries PCRE for the number of capturing subpatterns in the compiled expression.
// NOTE(review): the declaration of 'c', any guard on m_re and the return statement are not
// visible in this view.
424 int CRegExp::GetCaptureTotal() const
428 pcre_fullinfo(m_re, NULL, PCRE_INFO_CAPTURECOUNT, &c);
// Expands the replacement expression 'sReplaceExp' using the last successful match.
// Recognised sequences:
//   "\&" and "\\"  - a literal '&' or '\'
//   "\0" .. "\99"  - the text of the subpattern with that number (GetMatch(n))
// The other special character is '&', which presumably inserts the whole match
// (GetMatch(0)); its branch header is not visible in this view.
// All other text is copied unchanged.
// NOTE(review): the early-return value for "no match / empty expression" and the final
// return are not visible in this view.
432 std::string CRegExp::GetReplaceString(const std::string& sReplaceExp) const
434 if (!m_bMatched || sReplaceExp.empty())
437 const char* const expr = sReplaceExp.c_str();
// Copy the leading text up to the first special character.
439 size_t pos = sReplaceExp.find_first_of("\\&");
440 std::string result(sReplaceExp, 0, pos);
441 result.reserve(sReplaceExp.size()); // very rough estimate
443 while(pos != std::string::npos)
445 if (expr[pos] == '\\')
447 // string is null-terminated and current char isn't null, so it's safe to advance to next char
448 pos++; // advance to next char
449 const char nextChar = expr[pos];
450 if (nextChar == '&' || nextChar == '\\')
451 { // this is "\&" or "\\" combination
452 result.push_back(nextChar); // add '&' or '\' to result
// isdigit() has undefined behaviour for negative arguments, so the (possibly
// signed) char is converted to unsigned char first; non-ASCII bytes are then
// safely classified as "not a digit".
455 else if (isdigit(static_cast<unsigned char>(nextChar)))
456 { // this is "\0" - "\9" combination
457 int subNum = nextChar - '0';
458 pos++; // advance to second next char
459 const char secondNextChar = expr[pos];
460 if (isdigit(static_cast<unsigned char>(secondNextChar)))
461 { // this is "\00" - "\99" combination
462 subNum = subNum * 10 + (secondNextChar - '0');
465 result.append(GetMatch(subNum));
470 result.append(GetMatch(0));
// Copy the literal text up to the next special character (or the end of the expression).
474 const size_t nextPos = sReplaceExp.find_first_of("\\&", pos);
475 result.append(sReplaceExp, pos, nextPos - pos);
// Returns the start position of subpattern 'iSub' from the last match, expressed
// as an offset into the string that was passed to the search (hence "+ m_offset").
// NOTE(review): the value returned for an invalid 'iSub' is not visible in this view.
482 int CRegExp::GetSubStart(int iSub) const
484 if (!IsValidSubNumber(iSub))
// Entry iSub*2 of the offsets vector holds the start offset of the subpattern.
487 return m_iOvector[iSub*2] + m_offset;
// Named-subpattern overload: resolves 'subName' to its subpattern number and
// delegates to GetSubStart(int).
490 int CRegExp::GetSubStart(const std::string& subName) const
492 return GetSubStart(GetNamedSubPatternNumber(subName.c_str()));
// Returns the length of subpattern 'iSub' from the last match.
// NOTE(review): the value returned for an invalid 'iSub' is not visible in this view.
495 int CRegExp::GetSubLength(int iSub) const
497 if (!IsValidSubNumber(iSub))
// Length = end offset (entry iSub*2+1) minus start offset (entry iSub*2).
500 return m_iOvector[(iSub*2)+1] - m_iOvector[(iSub*2)];
// Named-subpattern overload: resolves 'subName' to its subpattern number and
// delegates to GetSubLength(int).
503 int CRegExp::GetSubLength(const std::string& subName) const
505 return GetSubLength(GetNamedSubPatternNumber(subName.c_str()));
// Returns the text matched by subpattern 'iSub' (0 = whole match) in the last search.
// NOTE(review): the value returned for an invalid number or an unset/empty subpattern is not
// visible in this view.
508 std::string CRegExp::GetMatch(int iSub /* = 0 */) const
510 if (!IsValidSubNumber(iSub))
// Offsets in m_iOvector are relative to m_subject, the private copy of the searched text.
513 int pos = m_iOvector[(iSub*2)];
514 int len = m_iOvector[(iSub*2)+1] - pos;
// A negative start or a non-positive length is handled separately from the substr() path.
515 if (pos < 0 || len <= 0)
518 return m_subject.substr(pos, len);
// Named-subpattern overload: resolves 'subName' to its subpattern number and
// delegates to GetMatch(int).
521 std::string CRegExp::GetMatch(const std::string& subName) const
523 return GetMatch(GetNamedSubPatternNumber(subName.c_str()));
// Looks up the named subpattern 'strName' and stores its matched text in 'strMatch'.
// NOTE(review): the return values and the handling of an invalid name are not visible in
// this view.
526 bool CRegExp::GetNamedSubPattern(const char* strName, std::string& strMatch) const
// Translate the name into a subpattern number.
529 int iSub = pcre_get_stringnumber(m_re, strName);
530 if (!IsValidSubNumber(iSub))
532 strMatch = GetMatch(iSub);
// Returns the result of pcre_get_stringnumber() for 'strName' on the compiled pattern,
// i.e. the number of the named subpattern as reported by PCRE.
536 int CRegExp::GetNamedSubPatternNumber(const char* strName) const
538 return pcre_get_stringnumber(m_re, strName);
// Writes the offset pairs of the last match to the log.
// iLog - log level to write with; values outside LOGDEBUG..LOGNONE are handled by the
//        range check below
// NOTE(review): how the per-pair strings are joined into 'str' is not visible in this view.
541 void CRegExp::DumpOvector(int iLog /* = LOGDEBUG */)
543 if (iLog < LOGDEBUG || iLog > LOGNONE)
546 CStdString str = "{";
547 int size = GetSubCount(); // past the subpatterns is junk
// Pair 0 is the whole match, pairs 1..size are the subpatterns.
548 for (int i = 0; i <= size; i++)
550 CStdString t = StringUtils::Format("[%i,%i]", m_iOvector[(i*2)], m_iOvector[(i*2)+1]);
556 CLog::Log(iLog, "regexp ovector=%s", str.c_str());
// Releases the PCRE resources held by this object: the study data and, when JIT
// support is compiled in, the JIT stack.
// NOTE(review): the guards, the release of the compiled pattern and the pointer resets are
// not visible in this view.
559 void CRegExp::Cleanup()
569 pcre_free_study(m_sd);
573 #ifdef PCRE_HAS_JIT_CODE
576 pcre_jit_stack_free(m_jitStack);
// Checks that 'iSub' is non-negative and exceeds neither m_iMatchCount nor
// m_MaxNumOfBackrefrences.
582 inline bool CRegExp::IsValidSubNumber(int iSub) const
584 return iSub >= 0 && iSub <= m_iMatchCount && iSub <= m_MaxNumOfBackrefrences;
// Reports whether the linked PCRE library supports UTF-8.
// The answer is obtained from pcre_config() on first use and cached in m_Utf8Supported.
// NOTE(review): the handling of a failing pcre_config() call is not visible in this view.
588 bool CRegExp::IsUtf8Supported(void)
// -1 means the library has not been queried yet.
590 if (m_Utf8Supported == -1)
592 if (pcre_config(PCRE_CONFIG_UTF8, &m_Utf8Supported) != 0)
596 return m_Utf8Supported == 1;
// Reports whether the linked PCRE library supports Unicode properties.
// The answer is obtained from pcre_config() on first use and cached in m_UcpSupported.
// NOTE(review): the #else branch and the handling of a failing pcre_config() call are not
// visible in this view.
599 bool CRegExp::AreUnicodePropertiesSupported(void)
// The probe is only compiled in when the PCRE headers know about Unicode
// properties and PCRE_UCP is a real (non-zero) option.
601 #if defined(PCRE_CONFIG_UNICODE_PROPERTIES) && PCRE_UCP != 0
602 if (m_UcpSupported == -1)
604 if (pcre_config(PCRE_CONFIG_UNICODE_PROPERTIES, &m_UcpSupported) != 0)
609 return m_UcpSupported == 1;
// Checks UTF-8 and Unicode-property support of the PCRE library and logs a
// warning for each missing feature, followed by upgrade advice.
// Returns true only when both features are available.
612 bool CRegExp::LogCheckUtf8Support(void)
614 bool utf8FullSupport = true;
616 if (!CRegExp::IsUtf8Supported())
618 utf8FullSupport = false;
619 CLog::Log(LOGWARNING, "UTF-8 is not supported in PCRE lib, support for national symbols is limited!");
622 if (!CRegExp::AreUnicodePropertiesSupported())
624 utf8FullSupport = false;
625 CLog::Log(LOGWARNING, "Unicode properties are not enabled in PCRE lib, support for national symbols may be limited!");
// Give the user a hint on how to get full support; the message includes the
// version string of the PCRE library in use.
628 if (!utf8FullSupport)
630 CLog::Log(LOGNOTICE, "Consider installing PCRE lib version 8.10 or later with enabled Unicode properties and UTF-8 support. Your PCRE lib version: %s", PCRE::pcre_version());
632 CLog::Log(LOGNOTICE, "You will need to rebuild XBMC after PCRE lib update.");
636 return utf8FullSupport;
// Reports whether the linked PCRE library supports JIT compilation.
// The answer is cached in m_JitSupported; pcre_config() is only consulted when
// this file was built with JIT-aware PCRE headers (PCRE_HAS_JIT_CODE).
// NOTE(review): the value stored when JIT code is not compiled in, or when pcre_config()
// fails, is not visible in this view.
639 bool CRegExp::IsJitSupported(void)
// -1 means the library has not been queried yet.
641 if (m_JitSupported == -1)
643 #ifdef PCRE_HAS_JIT_CODE
644 if (pcre_config(PCRE_CONFIG_JIT, &m_JitSupported) != 0)
649 return m_JitSupported == 1;