2 * Copyright (C) 2005-2013 Team XBMC
5 * This Program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2, or (at your option)
10 * This Program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with XBMC; see the file COPYING. If not, see
17 * <http://www.gnu.org/licenses/>.
21 #include "ScraperParser.h"
23 #include "addons/AddonManager.h"
26 #include "addons/Scraper.h"
29 #include "utils/StringUtils.h"
31 #include "CharsetConverter.h"
32 #include "utils/StringUtils.h"
38 using namespace ADDON;
39 using namespace XFILE;
// Default constructor. The visible statements reset the cached root-element
// pointer and set the search-string encoding to "UTF-8".
// NOTE(review): the embedded line numbers skip 42, 44 and 46-49, so any other
// member initialisation is not shown in this listing.
41 CScraperParser::CScraperParser()
43 m_pRootElement = NULL;
45 m_SearchStringEncoding = "UTF-8";
// Copy constructor. The visible statements start from the same state as the
// default constructor: null root-element pointer and "UTF-8" encoding.
// NOTE(review): how (or whether) `parser` is actually copied is on lines not
// shown here (embedded numbers 55-59 are absent) - confirm against the full file.
50 CScraperParser::CScraperParser(const CScraperParser& parser)
52 m_pRootElement = NULL;
54 m_SearchStringEncoding = "UTF-8";
// Copy assignment. When the source parser holds a document, the scraper
// pointer is copied and a new CXBMCTinyXML is copy-constructed from the
// source document, so the two parsers do not share one document object.
// NOTE(review): the lines between the signature and the check (61-64) and
// after the allocation (69-76) are not shown; confirm that any previously
// owned m_document is released before it is overwritten here, and that
// self-assignment is handled.
60 CScraperParser &CScraperParser::operator=(const CScraperParser &parser)
65 if (parser.m_document)
67 m_scraper = parser.m_scraper;
68 m_document = new CXBMCTinyXML(*parser.m_document);
// Destructor.
// NOTE(review): the body (original lines 78-81) is not part of this listing,
// so what it releases cannot be confirmed from here.
77 CScraperParser::~CScraperParser()
// Resets parser state. The only statement visible here drops the cached
// root-element pointer.
// NOTE(review): original lines 85-90 are not shown; presumably they release
// m_document and clear the remaining members - confirm.
82 void CScraperParser::Clear()
84 m_pRootElement = NULL;
// Loads a scraper definition from the XML file at strXMLFile.
// A fresh CXBMCTinyXML document is allocated, the path is remembered in
// m_strFile, and when the file loads successfully the result of
// LoadFromXML() is returned.
// NOTE(review): the failure path (after original line 103) and whatever runs
// before the allocation (92-94) are not visible in this listing.
91 bool CScraperParser::Load(const CStdString& strXMLFile)
95 m_document = new CXBMCTinyXML();
100 m_strFile = strXMLFile;
102 if (m_document->LoadFile(strXMLFile))
103 return LoadFromXML();
// Validates the already-loaded document and picks up the search-string
// encoding. The root element is cached in m_pRootElement; when its tag is
// "scraper", the elements CreateSearchUrl, CreateArtistSearchUrl and
// CreateAlbumSearchUrl are looked up in that order and each one's
// "SearchStringEncoding" attribute is assigned to m_SearchStringEncoding,
// falling back to "UTF-8" when the attribute is absent.
// NOTE(review): the guards around each lookup and the return statements are
// on lines not shown here (e.g. 120-122, 140-146).
110 bool CScraperParser::LoadFromXML()
115 m_pRootElement = m_document->RootElement();
// NOTE(review): no null check on the root element is visible before this
// dereference - confirm an empty document cannot reach this point.
116 CStdString strValue = m_pRootElement->Value();
117 if (strValue == "scraper")
119 TiXmlElement* pChildElement = m_pRootElement->FirstChildElement("CreateSearchUrl");
// Assign-and-test: a missing attribute yields a null result, in which case
// the default encoding is used.
123 if (!(m_SearchStringEncoding = pChildElement->Attribute("SearchStringEncoding")))
124 m_SearchStringEncoding = "UTF-8";
127 pChildElement = m_pRootElement->FirstChildElement("CreateArtistSearchUrl");
131 if (!(m_SearchStringEncoding = pChildElement->Attribute("SearchStringEncoding")))
132 m_SearchStringEncoding = "UTF-8";
134 pChildElement = m_pRootElement->FirstChildElement("CreateAlbumSearchUrl");
138 if (!(m_SearchStringEncoding = pChildElement->Attribute("SearchStringEncoding")))
139 m_SearchStringEncoding = "UTF-8";
// Presumably the rejection path for a root tag other than "scraper": the
// cached root pointer is dropped again - confirm against the full file.
147 m_pRootElement = NULL;
// Expands placeholders inside strDest, in place, in four passes:
//   1. "$$N"            -> contents of buffer m_param[N-1]
//   2. "$INFO[name]"    -> m_scraper->GetSetting(name)
//   3. "$LOCALIZE[id]"  -> m_scraper->GetString(id), id parsed as base-10
//   4. the two characters backslash + 'n' -> a real newline
// After each substitution the search index is moved past the inserted text,
// so replacement text is not rescanned within the same pass.
// NOTE(review): some original lines are absent from this listing (e.g. 172
// and 184, which sit directly before the m_scraper calls).
151 void CScraperParser::ReplaceBuffers(CStdString& strDest)
// Buffers are visited from the highest number down; presumably so that a
// multi-digit token such as "$$10" is replaced before "$$1" can match its
// prefix.
155 for (int i=MAX_SCRAPER_BUFFERS-1; i>=0; i--)
158 CStdString temp = StringUtils::Format("$$%i",i+1);
159 while ((size_t)(iIndex = strDest.find(temp,iIndex)) != CStdString::npos) // COPIED FROM CStdString WITH THE ADDITION OF $ ESCAPING
161 strDest.replace(strDest.begin()+iIndex,strDest.begin()+iIndex+temp.GetLength(),m_param[i]);
162 iIndex += m_param[i].length();
// "$INFO[" is 6 characters long, hence the +6 / -6 when slicing the name.
167 while ((size_t)(iIndex = strDest.find("$INFO[",iIndex)) != CStdString::npos)
// NOTE(review): iEnd is not checked for "not found" in the visible lines; an
// unterminated "$INFO[" would give a negative length and an invalid range.
169 int iEnd = strDest.Find("]",iIndex);
170 CStdString strInfo = strDest.Mid(iIndex+6,iEnd-iIndex-6);
171 CStdString strReplace;
173 strReplace = m_scraper->GetSetting(strInfo);
174 strDest.replace(strDest.begin()+iIndex,strDest.begin()+iEnd+1,strReplace);
175 iIndex += strReplace.length();
177 // insert localize strings
// "$LOCALIZE[" is 10 characters long, hence the +10 / -10 below.
179 while ((size_t)(iIndex = strDest.find("$LOCALIZE[",iIndex)) != CStdString::npos)
// NOTE(review): same unchecked "]" search as in the $INFO pass.
181 int iEnd = strDest.Find("]",iIndex);
182 CStdString strInfo = strDest.Mid(iIndex+10,iEnd-iIndex-10);
183 CStdString strReplace;
185 strReplace = m_scraper->GetString(strtol(strInfo.c_str(),NULL,10));
186 strDest.replace(strDest.begin()+iIndex,strDest.begin()+iEnd+1,strReplace);
187 iIndex += strReplace.length();
// The replacement is a single newline character, which cannot itself match
// the two-character search pattern, so searching again from iIndex ends.
190 while ((size_t)(iIndex = strDest.find("\\n",iIndex)) != CStdString::npos)
191 strDest.replace(strDest.begin()+iIndex,strDest.begin()+iIndex+2,"\n");
// Applies one <RegExp> element to `input` and writes the result to `dest`.
//
// The regular expression is taken from the child <expression> element and
// the replacement template from the element's "output" attribute. The
// attributes read from <expression> in the visible lines are:
//   cs        "yes" makes the match case sensitive (default: insensitive)
//   repeat    compared against "yes"
//   clear     "yes" empties dest up front
//   noclean / trim / fixchars / encode
//             comma-separated buffer numbers, expanded by GetBufferParams
//   optional, compare
//             integer attributes
// NOTE(review): this listing omits many original lines (null checks on the
// attribute pointers, declarations of iOptional / iCompare / temp / reg2, the
// code that writes to dest, and every brace). bAppend is not referenced in
// any visible line. Comments below describe only what is shown.
194 void CScraperParser::ParseExpression(const CStdString& input, CStdString& dest, TiXmlElement* element, bool bAppend)
196 CStdString strOutput = element->Attribute("output");
198 TiXmlElement* pExpression = element->FirstChildElement("expression");
201 bool bInsensitive=true;
202 const char* sensitive = pExpression->Attribute("cs");
204 if (stricmp(sensitive,"yes") == 0)
205 bInsensitive=false; // match case sensitive
207 CRegExp reg(bInsensitive, true);
208 CStdString strExpression;
// Use the text of <expression> when it has a child node; the "(.*)"
// assignment below is the fallback pattern.
209 if (pExpression->FirstChild())
210 strExpression = pExpression->FirstChild()->Value();
212 strExpression = "(.*)";
// Expand $$N / $INFO / $LOCALIZE placeholders in both the pattern and the
// output template before the pattern is compiled.
213 ReplaceBuffers(strExpression);
214 ReplaceBuffers(strOutput);
216 if (!reg.RegComp(strExpression.c_str()))
221 bool bRepeat = false;
222 const char* szRepeat = pExpression->Attribute("repeat");
224 if (stricmp(szRepeat,"yes") == 0)
227 const char* szClear = pExpression->Attribute("clear");
229 if (stricmp(szClear,"yes") == 0)
230 dest=""; // clear no matter if regexp fails
// Per-buffer flags. bClean defaults to true and is switched off for the
// buffers listed in "noclean"; the other three default to false and are
// switched on for the buffers listed in their attribute.
232 bool bClean[MAX_SCRAPER_BUFFERS];
233 GetBufferParams(bClean,pExpression->Attribute("noclean"),true);
235 bool bTrim[MAX_SCRAPER_BUFFERS];
236 GetBufferParams(bTrim,pExpression->Attribute("trim"),false);
238 bool bFixChars[MAX_SCRAPER_BUFFERS];
239 GetBufferParams(bFixChars,pExpression->Attribute("fixchars"),false);
241 bool bEncode[MAX_SCRAPER_BUFFERS];
242 GetBufferParams(bEncode,pExpression->Attribute("encode"),false);
245 pExpression->QueryIntAttribute("optional",&iOptional);
248 pExpression->QueryIntAttribute("compare",&iCompare);
// Lower-cases the comparison buffer in place so the later containment test
// is case-insensitive.
// NOTE(review): the guard on iCompare (original line 249) is not shown, and
// no upper-bound check on iCompare is visible.
250 m_param[iCompare-1].ToLower();
251 CStdString curInput = input;
// Wrap each \N back-reference in the output template with marker tokens.
// The conditions selecting which markers apply (original lines 254, 256,
// 258, 260) are not shown; presumably they test the flag arrays above.
252 for (int iBuf=0;iBuf<MAX_SCRAPER_BUFFERS;++iBuf)
255 InsertToken(strOutput,iBuf+1,"!!!CLEAN!!!");
257 InsertToken(strOutput,iBuf+1,"!!!TRIM!!!");
259 InsertToken(strOutput,iBuf+1,"!!!FIXCHARS!!!");
261 InsertToken(strOutput,iBuf+1,"!!!ENCODE!!!");
263 int i = reg.RegFind(curInput.c_str());
// Keep going while there is a match that starts inside the input; the
// size()==0 clause lets a match against an empty input through once.
264 while (i > -1 && (i < (int)curInput.size() || curInput.size() == 0))
271 CStdString strCurOutput=strOutput;
273 if (iOptional > -1) // check that required param is there
// szParam is the text captured by group number iOptional for this match.
276 sprintf(temp,"\\%i",iOptional);
277 std::string szParam = reg.GetReplaceString(temp);
// reg2 locates a section of the template written as a literal "\(" ... "\)"
// that contains the literal text "\2". Capture group 2 is that section
// starting at "\(" and excluding the closing "\)".
// NOTE(review): the pattern hard-codes "\2" rather than using iOptional -
// confirm this is intended.
279 reg2.RegComp("(.*)(\\\\\\(.*\\\\2.*)\\\\\\)(.*)");
280 int i2=reg2.RegFind(strCurOutput.c_str());
283 std::string szRemove(reg2.GetMatch(2));
284 int iRemove = szRemove.size();
285 int i3 = strCurOutput.find(szRemove);
286 if (!szParam.empty())
// Parameter present: keep the section and drop only its delimiters, the
// closing "\)" first so that i3 stays valid for the opening "\(".
288 strCurOutput.erase(i3+iRemove,2);
289 strCurOutput.erase(i3,2);
// Otherwise the whole section, including the closing "\)", is removed.
292 strCurOutput.replace(strCurOutput.begin()+i3,strCurOutput.begin()+i3+iRemove+2,"");
294 i2 = reg2.RegFind(strCurOutput.c_str());
298 int iLen = reg.GetFindLen();
299 // nasty hack #1 - & means \0 in a replace string
// '&' is swapped for a placeholder around GetReplaceString and restored
// afterwards, so literal ampersands in the template survive.
300 strCurOutput.Replace("&","!!!AMPAMP!!!");
301 std::string result = reg.GetReplaceString(strCurOutput.c_str());
304 CStdString strResult(result);
305 strResult.Replace("!!!AMPAMP!!!","&");
307 ReplaceBuffers(strResult);
// Case-insensitive containment test of the comparison buffer in the result
// (the buffer itself was lower-cased above).
310 CStdString strResultNoCase = strResult;
311 strResultNoCase.ToLower();
312 if (strResultNoCase.Find(m_param[iCompare-1]) != -1)
// Repeat mode: discard the input up to the end of this match (clamped to the
// input length) and search again. Requiring iLen > 0 stops a zero-length
// match from repeating without consuming input.
318 if (bRepeat && iLen > 0)
320 curInput.erase(0,i+iLen>(int)curInput.size()?curInput.size():i+iLen);
321 i = reg.RegFind(curInput.c_str());
// Walks `element` and its following "RegExp" siblings. For each one it
// recurses into the first nested "RegExp" child and the first "clear" child,
// then works out the destination buffer, the input text and whether the
// expression should run, and hands the work to ParseExpression.
// NOTE(review): loop headers, null checks and several declarations are on
// lines not present in this listing (e.g. 332-333, 347-349, 357-360).
329 void CScraperParser::ParseNext(TiXmlElement* element)
331 TiXmlElement* pReg = element;
334 TiXmlElement* pChildReg = pReg->FirstChildElement("RegExp");
336 ParseNext(pChildReg);
339 TiXmlElement* pChildReg = pReg->FirstChildElement("clear");
341 ParseNext(pChildReg);
345 bool bAppend = false;
346 const char* szDest = pReg->Attribute("dest");
// A trailing '+' on the dest attribute is tested here; the statement it
// controls (original line 351) is not shown - presumably it sets bAppend.
// NOTE(review): an empty dest string would index szDest[-1] unless one of
// the lines not shown rules it out.
350 if (szDest[strlen(szDest)-1] == '+')
// atoi stops at the first non-digit, so a trailing '+' does not affect the
// parsed buffer number.
353 iDest = atoi(szDest);
356 const char *szInput = pReg->Attribute("input");
// The input text gets its placeholders expanded; the assignment from
// m_param[0] further down is the alternative source of input (the branch
// structure between the two is on lines not shown).
361 ReplaceBuffers(strInput);
364 strInput = m_param[0];
366 const char* szConditional = pReg->Attribute("conditional");
367 bool bExecute = true;
// A leading '!' on the conditional is tested here; bInverse is set on a line
// not shown.
371 if (szConditional[0] == '!')
376 CStdString strSetting;
377 if (m_scraper && m_scraper->HasSettings())
378 strSetting = m_scraper->GetSetting(szConditional);
// Execute when the setting equals "true", or when it does not and the
// condition was inverted.
379 bExecute = bInverse != strSetting.Equals("true");
// iDest is 1-based in the XML; only a valid 0-based index reaches m_param.
384 if (iDest-1 < MAX_SCRAPER_BUFFERS && iDest-1 > -1)
385 ParseExpression(strInput, m_param[iDest-1],pReg,bAppend);
387 CLog::Log(LOGERROR,"CScraperParser::ParseNext: destination buffer "
388 "out of bounds, skipping expression");
391 pReg = pReg->NextSiblingElement("RegExp");
// Runs the scraper function named strTag: finds the child of the root
// element with that tag, evaluates its "RegExp" children via ParseNext and
// takes the contents of the result buffer. The result buffer defaults to 1
// and can be overridden by the function element's "dest" attribute.
// NOTE(review): the rest of the signature (original lines 396-397), the
// return statements and the body of the clearbuffers branch are not part of
// this listing.
395 const CStdString CScraperParser::Parse(const CStdString& strTag,
// NOTE(review): m_pRootElement is dereferenced without a visible null check.
398 TiXmlElement* pChildElement = m_pRootElement->FirstChildElement(strTag.c_str());
399 if(pChildElement == NULL)
401 CLog::Log(LOGERROR,"%s: Could not find scraper function %s",__FUNCTION__,strTag.c_str());
404 int iResult = 1; // default to param 1
405 pChildElement->QueryIntAttribute("dest",&iResult);
406 TiXmlElement* pChildStart = pChildElement->FirstChildElement("RegExp");
408 ParseNext(pChildStart);
// NOTE(review): iResult comes straight from the XML and no range check is
// visible before it indexes m_param; compare the bounds test in ParseNext.
409 CStdString tmp = m_param[iResult-1];
411 const char* szClearBuffers = pChildElement->Attribute("clearbuffers");
// True unless the attribute is present and equals "no" (case-insensitive).
412 if (!szClearBuffers || stricmp(szClearBuffers,"no") != 0)
// Post-processes strDirty in place by resolving the marker pairs that were
// wrapped around captured text. Four passes run in this order, each replacing
// "<marker>text<marker>" with the processed text and continuing the search
// after the inserted text:
//   !!!CLEAN!!!     HTML tags removed, then trimmed
//   !!!TRIM!!!      trimmed
//   !!!FIXCHARS!!!  HTML entities converted (via a wide-string round trip in
//                   the search-string encoding), trimmed, then ConvertJSON
//   !!!ENCODE!!!    passed through CURL::Encode
// The numeric offsets 11, 10, 14 and 12 are the lengths of the markers.
// NOTE(review): the declarations of i / i2 / wbuffer and the branch taken
// when a closing marker is missing are on lines not shown in this listing.
418 void CScraperParser::Clean(CStdString& strDirty)
421 CStdString strBuffer;
422 while ((i=strDirty.Find("!!!CLEAN!!!",i)) != -1)
425 if ((i2=strDirty.Find("!!!CLEAN!!!",i+11)) != -1)
// Text between the opening and closing marker.
427 strBuffer = strDirty.substr(i+11,i2-i-11);
428 CStdString strConverted(strBuffer);
429 HTML::CHTMLUtil::RemoveTags(strConverted);
430 StringUtils::Trim(strConverted);
// Remove both markers plus the enclosed text, then insert the cleaned text.
431 strDirty.erase(i,i2-i+11);
432 strDirty.Insert(i,strConverted);
433 i += strConverted.size();
439 while ((i=strDirty.Find("!!!TRIM!!!",i)) != -1)
442 if ((i2=strDirty.Find("!!!TRIM!!!",i+10)) != -1)
444 strBuffer = strDirty.substr(i+10,i2-i-10);
445 StringUtils::Trim(strBuffer);
446 strDirty.erase(i,i2-i+10);
447 strDirty.Insert(i,strBuffer);
448 i += strBuffer.size();
454 while ((i=strDirty.Find("!!!FIXCHARS!!!",i)) != -1)
457 if ((i2=strDirty.Find("!!!FIXCHARS!!!",i+14)) != -1)
459 strBuffer = strDirty.substr(i+14,i2-i-14);
// Round trip: to wide characters, convert the HTML, back to the same
// encoding.
461 g_charsetConverter.toW(strBuffer,wbuffer,GetSearchStringEncoding());
462 CStdStringW wConverted;
463 HTML::CHTMLUtil::ConvertHTMLToW(wbuffer,wConverted);
464 g_charsetConverter.fromW(wConverted,strBuffer,GetSearchStringEncoding());
465 StringUtils::Trim(strBuffer);
466 ConvertJSON(strBuffer);
467 strDirty.erase(i,i2-i+14);
468 strDirty.Insert(i,strBuffer);
469 i += strBuffer.size();
475 while ((i=strDirty.Find("!!!ENCODE!!!",i)) != -1)
478 if ((i2=strDirty.Find("!!!ENCODE!!!",i+12)) != -1)
480 strBuffer = strDirty.substr(i+12,i2-i-12);
481 CURL::Encode(strBuffer);
482 strDirty.erase(i,i2-i+12);
483 strDirty.Insert(i,strBuffer);
484 i += strBuffer.size();
// Rewrites JSON-style escape sequences in `string`, in place:
//   \uXXXX  (four hex digits)          -> the HTML numeric entity &#xXXXX;
//   \xNN<text>;  (two decimal digits,
//            then text up to a ';')    -> the single character with hex code NN
//   \"                                 -> "
// NOTE(review): the declarations of reg / reg2 and the braces are on original
// lines that are not part of this listing.
491 void CScraperParser::ConvertJSON(CStdString &string)
// The hex digits are spelled out explicitly: a range written as [0-f] covers
// every ASCII character from '0' to 'f', which also admits ':', '@',
// 'G'-'Z', '_' and so on, and would produce invalid &#x...; entities.
494 reg.RegComp("\\\\u([0-9a-fA-F]{4})");
495 while (reg.RegFind(string.c_str()) > -1)
// pos is the start of the four captured digits; the "\u" prefix sits two
// characters before it.
497 int pos = reg.GetSubStart(1);
498 std::string szReplace(reg.GetMatch(1));
500 CStdString replace = StringUtils::Format("&#x%s;", szReplace.c_str());
501 string.replace(string.begin()+pos-2, string.begin()+pos+4, replace);
// NOTE(review): only pairs of decimal digits are accepted after "\x", so an
// escape such as \x3c is left untouched - confirm that this is intended.
505 reg2.RegComp("\\\\x([0-9]{2})([^\\\\]+;)");
506 while (reg2.RegFind(string.c_str()) > -1)
508 int pos1 = reg2.GetSubStart(1);
509 int pos2 = reg2.GetSubStart(2);
510 std::string szHexValue(reg2.GetMatch(1));
// %c consumes an int argument, so the long returned by strtol is converted
// explicitly before it goes through the variadic call.
512 CStdString replace = StringUtils::Format("%c", static_cast<int>(strtol(szHexValue.c_str(), NULL, 16)));
// Replaces everything from the "\x" prefix to the end of the second group.
513 string.replace(string.begin()+pos1-2, string.begin()+pos2+reg2.GetSubLength(2), replace);
516 string.Replace("\\\"","\"");
// Iterates over all MAX_SCRAPER_BUFFERS entries of m_param.
// NOTE(review): the loop body (original line 523) is not part of this
// listing; going by the comment below it empties each buffer.
519 void CScraperParser::ClearBuffers()
521 //clear all m_param strings
522 for (int i=0;i<MAX_SCRAPER_BUFFERS;++i)
// Expands a comma-separated list of 1-based buffer numbers into a flag array.
//
// result    array of MAX_SCRAPER_BUFFERS flags, fully overwritten
// attribute text such as "1,3"; each listed buffer gets !defvalue
// defvalue  value given to every buffer that is not listed
//
// Entries that do not name a valid buffer (zero, negative, non-numeric or
// larger than MAX_SCRAPER_BUFFERS) are ignored.
// NOTE(review): the guard for a null `attribute` is on original lines
// 530-531, which are not part of this listing.
526 void CScraperParser::GetBufferParams(bool* result, const char* attribute, bool defvalue)
528 for (int iBuf=0;iBuf<MAX_SCRAPER_BUFFERS;++iBuf)
529 result[iBuf] = defvalue;
532 vector<std::string> vecBufs;
533 StringUtils::Tokenize(attribute,vecBufs,",");
534 for (size_t nToken=0; nToken < vecBufs.size(); nToken++)
536 int index = atoi(vecBufs[nToken].c_str())-1;
// atoi yields 0 for "0" and for non-numeric text, which makes index -1; the
// lower bound must be checked as well or result[-1] would be written.
537 if (index >= 0 && index < MAX_SCRAPER_BUFFERS)
538 result[index] = !defvalue;
// Surrounds every occurrence of the back-reference "\<buf>" in strOutput with
// `token`, giving "<token>\<buf><token>", so the captured text can be
// located and post-processed later.
// NOTE(review): the declarations of temp and i2 and the statements that
// advance i2 (original lines 551 and 553) are not part of this listing; the
// size of temp cannot be checked against the sprintf below from here.
543 void CScraperParser::InsertToken(CStdString& strOutput, int buf, const char* token)
546 sprintf(temp,"\\%i",buf);
548 while ((i2 = strOutput.Find(temp,i2)) != -1)
// Opening token goes directly in front of the back-reference.
550 strOutput.Insert(i2,token);
// Closing token goes directly behind it; this relies on i2 having been moved
// past the opening token on the line not shown.
552 strOutput.Insert(i2+strlen(temp),token);
// Appends every top-level child node of `doc`'s root element to the end of
// this parser's root element, walking the children in sibling order.
// NOTE(review): doc->RootElement() and m_pRootElement are both dereferenced
// without a visible null check; the loop header (original lines 560-561) and
// the end of the function are not part of this listing.
557 void CScraperParser::AddDocument(const CXBMCTinyXML* doc)
559 const TiXmlNode* node = doc->RootElement()->FirstChild();
// The node is passed by reference; presumably TinyXML inserts a copy, leaving
// `doc` unchanged - confirm against the TinyXML documentation.
562 m_pRootElement->InsertEndChild(*node);
563 node = node->NextSibling();