2 * Copyright (C) 2005-2013 Team XBMC
5 * This Program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2, or (at your option)
10 * This Program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with XBMC; see the file COPYING. If not, see
17 * <http://www.gnu.org/licenses/>.
25 #include "StdString.h"
27 #include "utils/StringUtils.h"
// Compatibility shims so the rest of the file can be written against the
// JIT-aware PCRE API even when the installed PCRE header lacks those symbols.
// PCRE_HAS_JIT_CODE is defined only when the header provides PCRE_CONFIG_JIT.
35 #ifdef PCRE_CONFIG_JIT
36 #define PCRE_HAS_JIT_CODE 1
// Without a JIT study flag in the header, the flag degrades to 0 (no study options).
39 #ifndef PCRE_STUDY_JIT_COMPILE
40 #define PCRE_STUDY_JIT_COMPILE 0
// Stand-in value for PCRE_INFO_JIT.
// NOTE(review): the guard for this define is not visible in this listing — presumably #ifndef PCRE_INFO_JIT; confirm.
44 #define PCRE_INFO_JIT 2048
// Without JIT support, study data is released with plain pcre_free().
46 #ifndef PCRE_HAS_JIT_CODE
47 #define pcre_free_study(x) pcre_free((x))
// Cached PCRE capability flags shared by all CRegExp instances.
// -1 means "not queried yet"; IsUtf8Supported(), AreUnicodePropertiesSupported()
// and IsJitSupported() compare against -1 before querying pcre_config() and
// report support when the value equals 1.
50 int CRegExp::m_Utf8Supported = -1;
51 int CRegExp::m_UcpSupported = -1;
52 int CRegExp::m_JitSupported = -1;
/**
 * Constructs the object by delegating all setup to InitValues().
 *
 * @param caseless forwarded unchanged to InitValues()
 * @param utf8     forwarded unchanged to InitValues()
 */
55 CRegExp::CRegExp(bool caseless /*= false*/, bool utf8 /*= false*/)
57 InitValues(caseless, utf8);
/**
 * Sets the PCRE compile options and resets match-related state.
 *
 * @param caseless requests case-insensitive matching
 * @param utf8     requests UTF-8 (and Unicode property) aware matching
 *
 * NOTE(review): the conditions guarding the option additions below are not
 * visible in this listing — presumably `caseless` and `utf8`; confirm.
 */
60 void CRegExp::InitValues(bool caseless /*= false*/, bool utf8 /*= false*/)
// Baseline options that are always set.
64 m_iOptions = PCRE_DOTALL | PCRE_NEWLINE_ANY;
66 m_iOptions |= PCRE_CASELESS;
// UTF-8 and Unicode-property flags are only added when the linked PCRE library reports support for them.
69 if (IsUtf8Supported())
70 m_iOptions |= PCRE_UTF8;
71 if (AreUnicodePropertiesSupported())
72 m_iOptions |= PCRE_UCP;
76 m_jitCompiled = false;
// Zero the whole output vector so no stale capture offsets remain.
81 memset(m_iOvector, 0, sizeof(m_iOvector));
/**
 * Constructs the object from a pattern string.
 *
 * @param caseless forwarded to InitValues()
 * @param utf8     forwarded to InitValues()
 * @param re       pattern text
 * @param study    requested study mode
 *
 * NOTE(review): `re` and `study` are not used on the lines visible in this
 * listing — presumably the pattern is compiled after InitValues(); confirm.
 */
84 CRegExp::CRegExp(bool caseless, bool utf8, const char *re, studyMode study /*= NoStudy*/)
86 InitValues(caseless, utf8);
/**
 * Copy constructor.
 *
 * @param re object to copy from
 *
 * NOTE(review): only the option copy is visible in this listing; how the
 * remaining members are initialised/copied cannot be told from here.
 */
90 CRegExp::CRegExp(const CRegExp& re)
95 m_iOptions = re.m_iOptions;
/**
 * Copy assignment: duplicates the compiled pattern and the last match state
 * of `re` into this object.
 *
 * @param re object to copy from
 *
 * NOTE(review): no copy of the study data (m_sd) appears on the visible lines,
 * and m_jitCompiled is reset, so the copy looks un-studied — confirm.
 * NOTE(review): self-assignment handling is not visible in this listing; verify.
 */
99 const CRegExp& CRegExp::operator=(const CRegExp& re)
// The copy never counts as JIT-compiled.
103 m_jitCompiled = false;
104 m_pattern = re.m_pattern;
// Ask PCRE for the byte size of the compiled pattern so it can be cloned.
107 if (pcre_fullinfo(re.m_re, NULL, PCRE_INFO_SIZE, &size) >= 0)
// The clone lives in a malloc()ed block of exactly that size.
109 if ((m_re = (pcre*)malloc(size)))
111 memcpy(m_re, re.m_re, size);
// Carry over the last match results as well (capture offsets, offset, counters, subject).
112 memcpy(m_iOvector, re.m_iOvector, OVECCOUNT*sizeof(int));
113 m_offset = re.m_offset;
114 m_iMatchCount = re.m_iMatchCount;
115 m_bMatched = re.m_bMatched;
116 m_subject = re.m_subject;
117 m_iOptions = re.m_iOptions;
// Allocation failure is logged; presumably this is the else-branch of the malloc() check above.
120 CLog::Log(LOGSEVERE, "%s: Failed to allocate memory", __FUNCTION__);
/**
 * Compiles a pattern with the options stored in m_iOptions and optionally
 * studies it.
 *
 * @param re    pattern text passed to pcre_compile()
 * @param study study mode; StudyWithJitComp additionally requests JIT
 *              compilation when IsJitSupported() reports it
 * @return bool — NOTE(review): the return statements are not visible in this
 *         listing; presumably false on compilation failure, confirm.
 */
131 bool CRegExp::RegComp(const char *re, studyMode study /*= NoStudy*/)
137 m_jitCompiled = false;
140 const char *errMsg = NULL;
145 m_re = pcre_compile(re, m_iOptions, &errMsg, &errOffset, NULL);
// Compilation failure is logged with PCRE's message, the error offset and the pattern.
149 CLog::Log(LOGERROR, "PCRE: %s. Compilation failed at offset %d in expression '%s'",
150 errMsg, errOffset, re);
// JIT compilation is requested only when asked for by the caller and supported by the library.
158 const bool jitCompile = (study == StudyWithJitComp) && IsJitSupported();
159 const int studyOptions = jitCompile ? PCRE_STUDY_JIT_COMPILE : 0;
161 m_sd = pcre_study(m_re, studyOptions, &errMsg);
// A study error is only a warning; any study data is released.
164 CLog::Log(LOGWARNING, "%s: PCRE error \"%s\" while studying expression", __FUNCTION__, errMsg);
167 pcre_free_study(m_sd);
// The pattern counts as JIT-compiled only if PCRE itself confirms JIT code is present.
174 m_jitCompiled = (pcre_fullinfo(m_re, m_sd, PCRE_INFO_JIT, &jitPresent) == 0 && jitPresent == 1);
181 int CRegExp::RegFind(const char *str, unsigned int startoffset /*= 0*/, int maxNumberOfCharsToTest /*= -1*/)
183 return PrivateRegFind(strlen(str), str, startoffset, maxNumberOfCharsToTest);
/**
 * Runs the compiled expression against a buffer of known length.
 *
 * @param bufferLen              number of chars available at str
 * @param str                    subject buffer
 * @param startoffset            offset in str where matching begins
 * @param maxNumberOfCharsToTest when >= 0, the subject is cut at
 *                               startoffset + maxNumberOfCharsToTest
 * @return on a match, the match start translated back into str
 *         (m_iOvector[0] + startoffset).
 *         NOTE(review): the failure return value is on lines not visible in
 *         this listing — presumably negative; confirm.
 */
186 int CRegExp::PrivateRegFind(size_t bufferLen, const char *str, unsigned int startoffset /* = 0*/, int maxNumberOfCharsToTest /*= -1*/)
// Argument/state validation; each failure is logged (the guarding conditions for the first two are not visible here).
194 CLog::Log(LOGERROR, "PCRE: Called before compilation");
200 CLog::Log(LOGERROR, "PCRE: Called without a string to match");
204 if (startoffset > bufferLen)
206 CLog::Log(LOGERROR, "%s: startoffset is beyond end of string to match", __FUNCTION__);
// The JIT stack is created on first use, only for JIT-compiled patterns.
210 #ifdef PCRE_HAS_JIT_CODE
211 if (m_jitCompiled && !m_jitStack)
213 m_jitStack = pcre_jit_stack_alloc(32*1024, 512*1024);
214 if (m_jitStack == NULL)
215 CLog::Log(LOGWARNING, "%s: can't allocate address space for JIT stack", __FUNCTION__);
217 pcre_assign_jit_stack(m_sd, NULL, m_jitStack);
// Optionally shorten the examined range.
221 if (maxNumberOfCharsToTest >= 0)
222 bufferLen = std::min<size_t>(bufferLen, startoffset + maxNumberOfCharsToTest);
// The subject is copied starting at startoffset, so offsets reported by PCRE are relative to m_subject, not to str.
224 m_subject.assign(str + startoffset, bufferLen - startoffset);
// NOTE(review): the study-data argument is NULL here although RegComp() fills m_sd and the JIT stack above is assigned to m_sd — confirm whether m_sd was meant to be passed.
225 int rc = pcre_exec(m_re, NULL, m_subject.c_str(), m_subject.length(), 0, 0, m_iOvector, OVECCOUNT);
// Handling of pcre_exec() error codes.
231 case PCRE_ERROR_NOMATCH:
234 case PCRE_ERROR_MATCHLIMIT:
235 CLog::Log(LOGERROR, "PCRE: Match limit reached");
// PCRE_ERROR_SHORTUTF8 is handled only when the PCRE header defines it.
238 #ifdef PCRE_ERROR_SHORTUTF8
239 case PCRE_ERROR_SHORTUTF8:
241 case PCRE_ERROR_BADUTF8:
242 CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character");
245 case PCRE_ERROR_BADUTF8_OFFSET:
// NOTE(review): startoffset is unsigned int but is formatted with %d.
246 CLog::Log(LOGERROR, "PCRE: Offset (%d) is pointing to the middle of UTF-8 character", startoffset);
250 CLog::Log(LOGERROR, "PCRE: Unknown error: %d", rc);
// Remember the offset so later accessors can translate subject-relative positions back into str.
254 m_offset = startoffset;
257 return m_iOvector[0] + m_offset;
/**
 * Queries PCRE for the number of capturing subpatterns in the compiled
 * expression (PCRE_INFO_CAPTURECOUNT).
 *
 * NOTE(review): the declaration/initial value of `c`, any guard on m_re and
 * the return statement are not visible in this listing.
 */
260 int CRegExp::GetCaptureTotal() const
264 pcre_fullinfo(m_re, NULL, PCRE_INFO_CAPTURECOUNT, &c);
/**
 * Expands a replacement expression using the results of the last match.
 *
 * The expression is scanned for '\' and '&': "\&" and "\\" yield the literal
 * character, "\0".."\9" and "\00".."\99" yield the text of that subpattern
 * via GetMatch(), and GetMatch(0) (the whole match) is appended on the
 * remaining branch — presumably for a bare '&'; the branch line is not
 * visible in this listing.
 *
 * @param sReplaceExp replacement expression
 * @return the expanded string.
 *         NOTE(review): what is returned when nothing matched or the
 *         expression is empty is on a line not visible here.
 */
268 std::string CRegExp::GetReplaceString(const std::string& sReplaceExp) const
270 if (!m_bMatched || sReplaceExp.empty())
273 const char* const expr = sReplaceExp.c_str();
// Everything before the first special character is copied verbatim.
275 size_t pos = sReplaceExp.find_first_of("\\&");
276 std::string result(sReplaceExp, 0, pos);
277 result.reserve(sReplaceExp.size()); // very rough estimate
279 while(pos != std::string::npos)
281 if (expr[pos] == '\\')
283 // string is null-terminated and current char isn't null, so it's safe to advance to next char
284 pos++; // advance to next char
285 const char nextChar = expr[pos];
286 if (nextChar == '&' || nextChar == '\\')
287 { // this is "\&" or "\\" combination
288 result.push_back(nextChar); // add '&' or '\' to result
// NOTE(review): isdigit() receives a plain char here and below; a negative char value is outside isdigit()'s defined domain — consider casting to unsigned char.
291 else if (isdigit(nextChar))
292 { // this is "\0" - "\9" combination
293 int subNum = nextChar - '0';
294 pos++; // advance to second next char
295 const char secondNextChar = expr[pos];
296 if (isdigit(secondNextChar))
297 { // this is "\00" - "\99" combination
298 subNum = subNum * 10 + (secondNextChar - '0');
301 result.append(GetMatch(subNum));
306 result.append(GetMatch(0));
// Copy the literal text up to the next special character (or to the end when nextPos is npos).
310 const size_t nextPos = sReplaceExp.find_first_of("\\&", pos);
311 result.append(sReplaceExp, pos, nextPos - pos);
/**
 * Start position of subpattern iSub from the last match.
 *
 * @param iSub subpattern number (checked with IsValidSubNumber())
 * @return the subpattern's start from the output vector plus m_offset.
 *         NOTE(review): the value returned for an invalid number is on a
 *         line not visible in this listing.
 */
318 int CRegExp::GetSubStart(int iSub) const
320 if (!IsValidSubNumber(iSub))
// Each subpattern owns a pair of ovector slots; the even slot is the start. m_offset is added on top of that subject-relative value.
323 return m_iOvector[iSub*2] + m_offset;
326 int CRegExp::GetSubStart(const std::string& subName) const
328 return GetSubStart(GetNamedSubPatternNumber(subName.c_str()));
/**
 * Length of subpattern iSub from the last match.
 *
 * @param iSub subpattern number (checked with IsValidSubNumber())
 * @return end offset minus start offset of the subpattern.
 *         NOTE(review): the value returned for an invalid number is on a
 *         line not visible in this listing.
 */
331 int CRegExp::GetSubLength(int iSub) const
333 if (!IsValidSubNumber(iSub))
// Odd ovector slot = end of the subpattern, even slot = start.
336 return m_iOvector[(iSub*2)+1] - m_iOvector[(iSub*2)];
339 int CRegExp::GetSubLength(const std::string& subName) const
341 return GetSubLength(GetNamedSubPatternNumber(subName.c_str()));
/**
 * Text of subpattern iSub from the last match.
 *
 * @param iSub subpattern number (checked with IsValidSubNumber())
 * @return the matching slice of the stored subject string.
 *         NOTE(review): the values returned for an invalid number or an
 *         unset/empty capture are on lines not visible in this listing —
 *         presumably an empty string; confirm.
 */
344 std::string CRegExp::GetMatch(int iSub /* = 0 */) const
346 if (!IsValidSubNumber(iSub))
349 int pos = m_iOvector[(iSub*2)];
350 int len = m_iOvector[(iSub*2)+1] - pos;
// Negative start or non-positive length is treated separately from a normal capture.
351 if (pos < 0 || len <= 0)
// Offsets are relative to m_subject, so no m_offset adjustment is needed here.
354 return m_subject.substr(pos, len);
357 std::string CRegExp::GetMatch(const std::string& subName) const
359 return GetMatch(GetNamedSubPatternNumber(subName.c_str()));
/**
 * Fetches the text captured by a named subpattern.
 *
 * @param strName  name of the subpattern, looked up with pcre_get_stringnumber()
 * @param strMatch receives the captured text when the resolved number is valid
 * @return bool — NOTE(review): the return statements are not visible in this
 *         listing; presumably true when strMatch was set, confirm.
 */
362 bool CRegExp::GetNamedSubPattern(const char* strName, std::string& strMatch) const
365 int iSub = pcre_get_stringnumber(m_re, strName);
366 if (!IsValidSubNumber(iSub))
368 strMatch = GetMatch(iSub);
372 int CRegExp::GetNamedSubPatternNumber(const char* strName) const
374 return pcre_get_stringnumber(m_re, strName);
/**
 * Writes the capture offset pairs of the last match to the log.
 *
 * @param iLog log level to use; values outside LOGDEBUG..LOGNONE are checked
 *             up front (the action taken is on a line not visible here —
 *             presumably an early return)
 */
377 void CRegExp::DumpOvector(int iLog /* = LOGDEBUG */)
379 if (iLog < LOGDEBUG || iLog > LOGNONE)
382 CStdString str = "{";
383 int size = GetSubCount(); // past the subpatterns is junk
// Pair 0 is the whole match, hence the inclusive upper bound.
384 for (int i = 0; i <= size; i++)
386 CStdString t = StringUtils::Format("[%i,%i]", m_iOvector[(i*2)], m_iOvector[(i*2)+1]);
392 CLog::Log(iLog, "regexp ovector=%s", str.c_str());
/**
 * Releases PCRE resources held by this object.
 *
 * NOTE(review): only the study-data and JIT-stack releases are visible in
 * this listing; the release of m_re and the pointer resets/guards are
 * presumably on the lines not shown.
 */
395 void CRegExp::Cleanup()
// pcre_free_study is mapped to pcre_free() by a macro when PCRE lacks JIT support.
405 pcre_free_study(m_sd);
// The JIT stack only exists in JIT-capable builds.
409 #ifdef PCRE_HAS_JIT_CODE
412 pcre_jit_stack_free(m_jitStack);
418 inline bool CRegExp::IsValidSubNumber(int iSub) const
420 return iSub >= 0 && iSub <= m_iMatchCount && iSub <= m_MaxNumOfBackrefrences;
/**
 * Reports whether the linked PCRE library supports UTF-8.
 *
 * The answer is obtained from pcre_config(PCRE_CONFIG_UTF8) while the cached
 * flag is still -1 and is read from m_Utf8Supported afterwards.
 *
 * @return true when the cached flag equals 1
 */
424 bool CRegExp::IsUtf8Supported(void)
426 if (m_Utf8Supported == -1)
// A non-zero result means the query itself failed (the handling is on a line not visible here).
428 if (pcre_config(PCRE_CONFIG_UTF8, &m_Utf8Supported) != 0)
432 return m_Utf8Supported == 1;
/**
 * Reports whether the linked PCRE library supports Unicode properties.
 *
 * @return true when the cached flag m_UcpSupported equals 1
 */
435 bool CRegExp::AreUnicodePropertiesSupported(void)
// The runtime query is compiled in only when the header knows the config item and PCRE_UCP is a real (non-zero) flag.
437 #if defined(PCRE_CONFIG_UNICODE_PROPERTIES) && PCRE_UCP != 0
438 if (m_UcpSupported == -1)
// A non-zero result means the query itself failed (the handling is on a line not visible here).
440 if (pcre_config(PCRE_CONFIG_UNICODE_PROPERTIES, &m_UcpSupported) != 0)
445 return m_UcpSupported == 1;
/**
 * Checks UTF-8 and Unicode-property support of the PCRE library and logs a
 * warning for each missing capability, followed by upgrade hints.
 *
 * @return true only when both UTF-8 and Unicode properties are supported
 */
448 bool CRegExp::LogCheckUtf8Support(void)
450 bool utf8FullSupport = true;
452 if (!CRegExp::IsUtf8Supported())
454 utf8FullSupport = false;
455 CLog::Log(LOGWARNING, "UTF-8 is not supported in PCRE lib, support for national symbols is limited!");
458 if (!CRegExp::AreUnicodePropertiesSupported())
460 utf8FullSupport = false;
461 CLog::Log(LOGWARNING, "Unicode properties are not enabled in PCRE lib, support for national symbols may be limited!");
// Hints are logged once, whichever capability was missing.
464 if (!utf8FullSupport)
466 CLog::Log(LOGNOTICE, "Consider installing PCRE lib version 8.10 or later with enabled Unicode properties and UTF-8 support. Your PCRE lib version: %s", PCRE::pcre_version());
468 CLog::Log(LOGNOTICE, "You will need to rebuild XBMC after PCRE lib update.");
472 return utf8FullSupport;
/**
 * Reports whether the linked PCRE library supports JIT compilation.
 *
 * @return true when the cached flag m_JitSupported equals 1
 */
475 bool CRegExp::IsJitSupported(void)
477 if (m_JitSupported == -1)
// The runtime query exists only when the PCRE header provides JIT symbols.
// NOTE(review): the value stored for builds without them is on a line not visible here.
479 #ifdef PCRE_HAS_JIT_CODE
480 if (pcre_config(PCRE_CONFIG_JIT, &m_JitSupported) != 0)
485 return m_JitSupported == 1;