2 * Copyright (C) 2005-2013 Team XBMC
5 * This Program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2, or (at your option)
10 * This Program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with XBMC; see the file COPYING. If not, see
17 * <http://www.gnu.org/licenses/>.
25 #include "StdString.h"
// Compatibility shims so the rest of this file can use the JIT-related PCRE names
// regardless of the PCRE version it is built against.
// PCRE_HAS_JIT_CODE is defined when the PCRE header exposes PCRE_CONFIG_JIT.
34 #ifdef PCRE_CONFIG_JIT
35 #define PCRE_HAS_JIT_CODE 1
// Fallback for headers without the JIT study option; 0 is the same value RegComp()
// passes when JIT compilation is not requested.
38 #ifndef PCRE_STUDY_JIT_COMPILE
39 #define PCRE_STUDY_JIT_COMPILE 0
// NOTE(review): presumably a fallback id for headers that lack PCRE_INFO_JIT - confirm
// that pcre_fullinfo() rejects this value on such libraries.
43 #define PCRE_INFO_JIT 2048
// Without JIT-capable headers, pcre_free_study() is mapped onto pcre_free().
45 #ifndef PCRE_HAS_JIT_CODE
46 #define pcre_free_study(x) pcre_free((x))
// Cached answers of the pcre_config() capability probes, shared by all instances.
// -1 is the initial "not probed yet" value: IsUtf8Supported(),
// AreUnicodePropertiesSupported() and IsJitSupported() test for -1 before probing
// and report support only when the cached value equals 1.
49 int CRegExp::m_Utf8Supported = -1;
50 int CRegExp::m_UcpSupported = -1;
51 int CRegExp::m_JitSupported = -1;
// Constructs an object without a compiled pattern.
// caseless and utf8 are forwarded to InitValues(), which derives the PCRE options from them.
54 CRegExp::CRegExp(bool caseless /*= false*/, bool utf8 /*= false*/)
56 InitValues(caseless, utf8);
// Puts the object into its initial state and derives the PCRE compile options
// (m_iOptions) from the requested case sensitivity and UTF-8 mode.
59 void CRegExp::InitValues(bool caseless /*= false*/, bool utf8 /*= false*/)
// Baseline options: '.' also matches newlines, and every newline convention is recognised.
63 m_iOptions = PCRE_DOTALL | PCRE_NEWLINE_ANY;
// NOTE(review): presumably applied only when `caseless` is true - confirm the guard.
65 m_iOptions |= PCRE_CASELESS;
// UTF-8 mode and Unicode character properties are switched on only if the linked
// PCRE library reports support for them.
// NOTE(review): presumably both checks apply only when `utf8` is true - confirm.
68 if (IsUtf8Supported())
69 m_iOptions |= PCRE_UTF8;
70 if (AreUnicodePropertiesSupported())
71 m_iOptions |= PCRE_UCP;
// No JIT code exists yet; RegComp() sets this flag after studying a pattern.
75 m_jitCompiled = false;
// Clear the whole capture offset vector.
80 memset(m_iOvector, 0, sizeof(m_iOvector));
// Constructs an object for the expression `re`.
// caseless and utf8 are forwarded to InitValues().
// NOTE(review): `re` and `study` are presumably handed to RegComp() - confirm.
83 CRegExp::CRegExp(bool caseless, bool utf8, const char *re, studyMode study /*= NoStudy*/)
85 InitValues(caseless, utf8);
// Copy constructor.
// Takes over the PCRE compile options of `re`.
// NOTE(review): the remaining state is presumably copied via operator= - confirm.
89 CRegExp::CRegExp(const CRegExp& re)
94 m_iOptions = re.m_iOptions;
// Copy assignment: duplicates the compiled pattern of `re` together with the
// state of its last match.
98 const CRegExp& CRegExp::operator=(const CRegExp& re)
// JIT state is not carried over; this object starts without JIT-compiled code.
102 m_jitCompiled = false;
103 m_pattern = re.m_pattern;
// Ask PCRE for the byte size of the compiled pattern so it can be cloned.
106 if (pcre_fullinfo(re.m_re, NULL, PCRE_INFO_SIZE, &size) >= 0)
// The compiled pattern is duplicated as a raw byte copy into a malloc()'ed buffer.
// NOTE(review): verify that Cleanup() releases m_re with a deallocator that matches malloc().
108 if ((m_re = (pcre*)malloc(size)))
110 memcpy(m_re, re.m_re, size);
// Copy the capture offsets, subject and match bookkeeping that the
// GetMatch()/GetSubStart()/GetSubLength() accessors read.
111 memcpy(m_iOvector, re.m_iOvector, OVECCOUNT*sizeof(int));
112 m_offset = re.m_offset;
113 m_iMatchCount = re.m_iMatchCount;
114 m_bMatched = re.m_bMatched;
115 m_subject = re.m_subject;
116 m_iOptions = re.m_iOptions;
// An allocation failure is logged at LOGSEVERE.
119 CLog::Log(LOGSEVERE, "%s: Failed to allocate memory", __FUNCTION__);
// Compiles the expression `re` with the options held in m_iOptions and studies it.
// study: StudyWithJitComp additionally requests JIT compilation when the library supports it.
130 bool CRegExp::RegComp(const char *re, studyMode study /*= NoStudy*/)
136 m_jitCompiled = false;
139 const char *errMsg = NULL;
// The last argument (NULL) selects PCRE's default character tables.
144 m_re = pcre_compile(re, m_iOptions, &errMsg, &errOffset, NULL);
// Compilation failure: log PCRE's message, the error offset and the offending expression.
148 CLog::Log(LOGERROR, "PCRE: %s. Compilation failed at offset %d in expression '%s'",
149 errMsg, errOffset, re);
// JIT compilation is requested only if the caller asked for it AND the linked PCRE offers JIT.
157 const bool jitCompile = (study == StudyWithJitComp) && IsJitSupported();
158 const int studyOptions = jitCompile ? PCRE_STUDY_JIT_COMPILE : 0;
160 m_sd = pcre_study(m_re, studyOptions, &errMsg);
// A study error is reported as a warning only.
163 CLog::Log(LOGWARNING, "%s: PCRE error \"%s\" while studying expression", __FUNCTION__, errMsg);
166 pcre_free_study(m_sd);
// Record whether PCRE actually produced JIT code for this pattern.
173 m_jitCompiled = (pcre_fullinfo(m_re, m_sd, PCRE_INFO_JIT, &jitPresent) == 0 && jitPresent == 1);
180 int CRegExp::RegFind(const char *str, unsigned int startoffset /*= 0*/, int maxNumberOfCharsToTest /*= -1*/)
182 return PrivateRegFind(strlen(str), str, startoffset, maxNumberOfCharsToTest);
// Runs the compiled expression against `str`.
// bufferLen              - number of bytes available in `str`
// startoffset            - byte offset in `str` at which matching starts
// maxNumberOfCharsToTest - if >= 0, limits how many characters after startoffset are tested
// On a match the position of the match inside `str` is returned.
185 int CRegExp::PrivateRegFind(size_t bufferLen, const char *str, unsigned int startoffset /* = 0*/, int maxNumberOfCharsToTest /*= -1*/)
// Precondition failures are logged as errors: no compiled expression, no subject
// string, or a start offset behind the end of the subject.
193 CLog::Log(LOGERROR, "PCRE: Called before compilation");
199 CLog::Log(LOGERROR, "PCRE: Called without a string to match");
203 if (startoffset > bufferLen)
205 CLog::Log(LOGERROR, "%s: startoffset is beyond end of string to match", __FUNCTION__);
// The JIT stack is allocated lazily, on the first search with a JIT-compiled pattern.
209 #ifdef PCRE_HAS_JIT_CODE
210 if (m_jitCompiled && !m_jitStack)
// 32 KiB initial size, 512 KiB maximum size.
212 m_jitStack = pcre_jit_stack_alloc(32*1024, 512*1024);
213 if (m_jitStack == NULL)
214 CLog::Log(LOGWARNING, "%s: can't allocate address space for JIT stack", __FUNCTION__);
216 pcre_assign_jit_stack(m_sd, NULL, m_jitStack);
// Clamp the tested range to maxNumberOfCharsToTest characters after startoffset.
220 if (maxNumberOfCharsToTest >= 0)
221 bufferLen = std::min<size_t>(bufferLen, startoffset + maxNumberOfCharsToTest);
// The subject is copied starting at startoffset, so the offsets PCRE stores in
// m_iOvector are relative to startoffset.
223 m_subject.assign(str + startoffset, bufferLen - startoffset);
// NOTE(review): the study-data argument is NULL here although RegComp() produces
// m_sd and the JIT stack is assigned to m_sd above - confirm whether m_sd should be passed.
224 int rc = pcre_exec(m_re, NULL, m_subject.c_str(), m_subject.length(), 0, 0, m_iOvector, OVECCOUNT);
// Error handling for pcre_exec() results: "no match" is not logged, the other
// known error codes are logged at LOGERROR.
230 case PCRE_ERROR_NOMATCH:
233 case PCRE_ERROR_MATCHLIMIT:
234 CLog::Log(LOGERROR, "PCRE: Match limit reached");
// PCRE_ERROR_SHORTUTF8 is not defined by every PCRE version.
237 #ifdef PCRE_ERROR_SHORTUTF8
238 case PCRE_ERROR_SHORTUTF8:
240 case PCRE_ERROR_BADUTF8:
241 CLog::Log(LOGERROR, "PCRE: Bad UTF-8 character");
244 case PCRE_ERROR_BADUTF8_OFFSET:
245 CLog::Log(LOGERROR, "PCRE: Offset (%d) is pointing to the middle of UTF-8 character", startoffset);
249 CLog::Log(LOGERROR, "PCRE: Unknown error: %d", rc);
// Remember the offset so results can be translated back to positions in `str`.
253 m_offset = startoffset;
// Start of the whole match, expressed as a position in the caller's string.
256 return m_iOvector[0] + m_offset;
// Queries the number of capturing sub-patterns of the compiled expression
// (PCRE_INFO_CAPTURECOUNT).
259 int CRegExp::GetCaptureTotal() const
263 pcre_fullinfo(m_re, NULL, PCRE_INFO_CAPTURECOUNT, &c);
// Expands the replacement expression sReplaceExp using the results of the last match.
// Recognised sequences in sReplaceExp:
//   "\&" and "\\"      - a literal '&' or '\'
//   "\0".."\9",
//   "\00".."\99"       - the text of the sub-pattern with that number
// Text between special characters is copied verbatim.
267 std::string CRegExp::GetReplaceString(const std::string& sReplaceExp) const
// Nothing to expand without a successful match or with an empty expression.
269 if (!m_bMatched || sReplaceExp.empty())
272 const char* const expr = sReplaceExp.c_str();
// Everything before the first '\' or '&' is copied unchanged.
274 size_t pos = sReplaceExp.find_first_of("\\&");
275 std::string result(sReplaceExp, 0, pos);
276 result.reserve(sReplaceExp.size()); // very rough estimate
278 while(pos != std::string::npos)
280 if (expr[pos] == '\\')
282 // string is null-terminated and current char isn't null, so it's safe to advance to next char
283 pos++; // advance to next char
284 const char nextChar = expr[pos];
285 if (nextChar == '&' || nextChar == '\\')
286 { // this is "\&" or "\\" combination
287 result.push_back(nextChar); // add '&' or '\' to result
// NOTE(review): isdigit() is called with a plain char; for bytes >= 0x80 on platforms
// where char is signed the argument is negative, which is undefined behaviour -
// consider casting to unsigned char.
290 else if (isdigit(nextChar))
291 { // this is "\0" - "\9" combination
292 int subNum = nextChar - '0';
293 pos++; // advance to second next char
294 const char secondNextChar = expr[pos];
295 if (isdigit(secondNextChar))
296 { // this is "\00" - "\99" combination
297 subNum = subNum * 10 + (secondNextChar - '0');
300 result.append(GetMatch(subNum));
// Whole match (sub-pattern 0); presumably the branch for an unescaped '&' - confirm.
305 result.append(GetMatch(0));
// Copy the literal text up to the next special character (or to the end of the expression).
309 const size_t nextPos = sReplaceExp.find_first_of("\\&", pos);
310 result.append(sReplaceExp, pos, nextPos - pos);
// Start position of sub-pattern iSub from the last match, expressed as a position in
// the string that was passed to the search (the stored start offset is added back).
// iSub is validated with IsValidSubNumber() first.
317 int CRegExp::GetSubStart(int iSub) const
319 if (!IsValidSubNumber(iSub))
// Even ovector slots hold the start offsets of the captured ranges.
322 return m_iOvector[iSub*2] + m_offset;
325 int CRegExp::GetSubStart(const std::string& subName) const
327 return GetSubStart(GetNamedSubPatternNumber(subName.c_str()));
// Length of the text captured by sub-pattern iSub in the last match.
// iSub is validated with IsValidSubNumber() first.
330 int CRegExp::GetSubLength(int iSub) const
332 if (!IsValidSubNumber(iSub))
// End offset (odd slot) minus start offset (even slot) of the captured range.
335 return m_iOvector[(iSub*2)+1] - m_iOvector[(iSub*2)];
338 int CRegExp::GetSubLength(const std::string& subName) const
340 return GetSubLength(GetNamedSubPatternNumber(subName.c_str()));
// Text captured by sub-pattern iSub in the last match (iSub 0 is the whole match).
// iSub is validated with IsValidSubNumber() first.
343 std::string CRegExp::GetMatch(int iSub /* = 0 */) const
345 if (!IsValidSubNumber(iSub))
// Offsets in m_iOvector are relative to the stored subject m_subject.
348 int pos = m_iOvector[(iSub*2)];
349 int len = m_iOvector[(iSub*2)+1] - pos;
// A negative start or a non-positive length is handled separately from the normal case.
350 if (pos < 0 || len <= 0)
353 return m_subject.substr(pos, len);
356 std::string CRegExp::GetMatch(const std::string& subName) const
358 return GetMatch(GetNamedSubPatternNumber(subName.c_str()));
// Looks up the named sub-pattern strName and stores its captured text in strMatch.
361 bool CRegExp::GetNamedSubPattern(const char* strName, std::string& strMatch) const
// Translate the name into a sub-pattern number of the compiled expression.
364 int iSub = pcre_get_stringnumber(m_re, strName);
// The number is validated before the captured text is fetched.
365 if (!IsValidSubNumber(iSub))
367 strMatch = GetMatch(iSub);
371 int CRegExp::GetNamedSubPatternNumber(const char* strName) const
373 return pcre_get_stringnumber(m_re, strName);
// Writes the capture offset pairs of the last match to the log at level iLog,
// formatted as a list of "[start,end]" entries.
376 void CRegExp::DumpOvector(int iLog /* = LOGDEBUG */)
// Guard against log levels outside the LOGDEBUG..LOGNONE range.
378 if (iLog < LOGDEBUG || iLog > LOGNONE)
381 CStdString str = "{";
382 int size = GetSubCount(); // past the subpatterns is junk
// Entry 0 is the whole match, so the loop bound is inclusive.
383 for (int i = 0; i <= size; i++)
386 t.Format("[%i,%i]", m_iOvector[(i*2)], m_iOvector[(i*2)+1]);
392 CLog::Log(iLog, "regexp ovector=%s", str.c_str());
// Releases the PCRE resources held by this object.
395 void CRegExp::Cleanup()
// Study data; pcre_free_study() is mapped to pcre_free() on builds without JIT-capable headers.
405 pcre_free_study(m_sd);
// The JIT stack exists only on JIT-capable builds.
409 #ifdef PCRE_HAS_JIT_CODE
412 pcre_jit_stack_free(m_jitStack);
418 inline bool CRegExp::IsValidSubNumber(int iSub) const
420 return iSub >= 0 && iSub <= m_iMatchCount && iSub <= m_MaxNumOfBackrefrences;
// Reports whether the linked PCRE library supports UTF-8.
// The pcre_config() answer is cached in the static m_Utf8Supported.
424 bool CRegExp::IsUtf8Supported(void)
// -1 means the library has not been probed yet.
426 if (m_Utf8Supported == -1)
// pcre_config() returns non-zero when the query itself fails.
428 if (pcre_config(PCRE_CONFIG_UTF8, &m_Utf8Supported) != 0)
432 return m_Utf8Supported == 1;
// Reports whether the linked PCRE library supports Unicode character properties.
// The pcre_config() answer is cached in the static m_UcpSupported.
435 bool CRegExp::AreUnicodePropertiesSupported(void)
// The probe is compiled in only if the PCRE header knows the config query and
// defines a non-zero PCRE_UCP option.
437 #if defined(PCRE_CONFIG_UNICODE_PROPERTIES) && PCRE_UCP != 0
// -1 means the library has not been probed yet.
438 if (m_UcpSupported == -1)
// pcre_config() returns non-zero when the query itself fails.
440 if (pcre_config(PCRE_CONFIG_UNICODE_PROPERTIES, &m_UcpSupported) != 0)
445 return m_UcpSupported == 1;
// Checks UTF-8 and Unicode property support of the linked PCRE library and logs
// a warning for every missing capability, followed by upgrade advice.
// Returns true only if both capabilities are available.
448 bool CRegExp::LogCheckUtf8Support(void)
450 bool utf8FullSupport = true;
452 if (!CRegExp::IsUtf8Supported())
454 utf8FullSupport = false;
455 CLog::Log(LOGWARNING, "UTF-8 is not supported in PCRE lib, support for national symbols is limited!");
458 if (!CRegExp::AreUnicodePropertiesSupported())
460 utf8FullSupport = false;
461 CLog::Log(LOGWARNING, "Unicode properties are not enabled in PCRE lib, support for national symbols may be limited!");
// Advice is logged once, whichever capability is missing; it includes the
// version string of the PCRE library in use.
464 if (!utf8FullSupport)
466 CLog::Log(LOGNOTICE, "Consider installing PCRE lib version 8.10 or later with enabled Unicode properties and UTF-8 support. Your PCRE lib version: %s", PCRE::pcre_version());
468 CLog::Log(LOGNOTICE, "You will need to rebuild XBMC after PCRE lib update.");
472 return utf8FullSupport;
// Reports whether the linked PCRE library provides JIT compilation.
// The answer is cached in the static m_JitSupported.
475 bool CRegExp::IsJitSupported(void)
// -1 means the library has not been probed yet.
477 if (m_JitSupported == -1)
// The library is queried only when the PCRE header is JIT-capable.
479 #ifdef PCRE_HAS_JIT_CODE
// pcre_config() returns non-zero when the query itself fails.
480 if (pcre_config(PCRE_CONFIG_JIT, &m_JitSupported) != 0)
485 return m_JitSupported == 1;