code.vuplus.com Git - vuplus_xbmc/blob - xbmc/utils/ScraperParser.cpp

   1 /*
   2  *      Copyright (C) 2005-2013 Team XBMC
   3  *      http://xbmc.org
   4  *
   5  *  This Program is free software; you can redistribute it and/or modify
   6  *  it under the terms of the GNU General Public License as published by
   7  *  the Free Software Foundation; either version 2, or (at your option)
   8  *  any later version.
   9  *
  10  *  This Program is distributed in the hope that it will be useful,
  11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13  *  GNU General Public License for more details.
  14  *
  15  *  You should have received a copy of the GNU General Public License
  16  *  along with XBMC; see the file COPYING.  If not, see
  17  *  <http://www.gnu.org/licenses/>.
  18  *
  19  */
  20
  21 #include "ScraperParser.h"
  22
  23 #include "addons/AddonManager.h"
  24 #include "RegExp.h"
  25 #include "HTMLUtil.h"
  26 #include "addons/Scraper.h"
  27 #include "URL.h"
  28 #include "Util.h"
  29 #include "utils/StringUtils.h"
  30 #include "log.h"
  31 #include "CharsetConverter.h"
  32 #include "utils/StringUtils.h"
  33
  34 #include <sstream>
  35 #include <cstring>
  36
  37 using namespace std;
  38 using namespace ADDON;
  39 using namespace XFILE;
  40
  41 CScraperParser::CScraperParser()
  42 {
  43   m_pRootElement = NULL;
  44   m_document = NULL;
  45   m_SearchStringEncoding = "UTF-8";
  46   m_scraper = NULL;
  47   m_isNoop = true;
  48 }
  49
  50 CScraperParser::CScraperParser(const CScraperParser& parser)
  51 {
  52   m_pRootElement = NULL;
  53   m_document = NULL;
  54   m_SearchStringEncoding = "UTF-8";
  55   m_scraper = NULL;
  56   m_isNoop = true;
  57   *this = parser;
  58 }
  59
  60 CScraperParser &CScraperParser::operator=(const CScraperParser &parser)
  61 {
  62   if (this != &parser)
  63   {
  64     Clear();
  65     if (parser.m_document)
  66     {
  67       m_scraper = parser.m_scraper;
  68       m_document = new CXBMCTinyXML(*parser.m_document);
  69       LoadFromXML();
  70     }
  71     else
  72       m_scraper = NULL;
  73   }
  74   return *this;
  75 }
  76
  77 CScraperParser::~CScraperParser()
  78 {
  79   Clear();
  80 }
  81
  82 void CScraperParser::Clear()
  83 {
  84   m_pRootElement = NULL;
  85   delete m_document;
  86
  87   m_document = NULL;
  88   m_strFile.clear();
  89 }
  90
  91 bool CScraperParser::Load(const CStdString& strXMLFile)
  92 {
  93   Clear();
  94
  95   m_document = new CXBMCTinyXML();
  96
  97   if (!m_document)
  98     return false;
  99
 100   m_strFile = strXMLFile;
 101
 102   if (m_document->LoadFile(strXMLFile))
 103     return LoadFromXML();
 104
 105   delete m_document;
 106   m_document = NULL;
 107   return false;
 108 }
 109
 110 bool CScraperParser::LoadFromXML()
 111 {
 112   if (!m_document)
 113     return false;
 114
 115   m_pRootElement = m_document->RootElement();
 116   CStdString strValue = m_pRootElement->Value();
 117   if (strValue == "scraper")
 118   {
 119     TiXmlElement* pChildElement = m_pRootElement->FirstChildElement("CreateSearchUrl");
 120     if (pChildElement)
 121     {
 122       m_isNoop = false;
 123       if (!(m_SearchStringEncoding = pChildElement->Attribute("SearchStringEncoding")))
 124         m_SearchStringEncoding = "UTF-8";
 125     }
 126
 127     pChildElement = m_pRootElement->FirstChildElement("CreateArtistSearchUrl");
 128     if (pChildElement)
 129     {
 130       m_isNoop = false;
 131       if (!(m_SearchStringEncoding = pChildElement->Attribute("SearchStringEncoding")))
 132         m_SearchStringEncoding = "UTF-8";
 133     }
 134     pChildElement = m_pRootElement->FirstChildElement("CreateAlbumSearchUrl");
 135     if (pChildElement)
 136     {
 137       m_isNoop = false;
 138       if (!(m_SearchStringEncoding = pChildElement->Attribute("SearchStringEncoding")))
 139         m_SearchStringEncoding = "UTF-8";
 140     }
 141
 142     return true;
 143   }
 144
 145   delete m_document;
 146   m_document = NULL;
 147   m_pRootElement = NULL;
 148   return false;
 149 }
 150
 151 void CScraperParser::ReplaceBuffers(CStdString& strDest)
 152 {
 153   // insert buffers
 154   int iIndex;
 155   for (int i=MAX_SCRAPER_BUFFERS-1; i>=0; i--)
 156   {
 157     iIndex = 0;
 158     CStdString temp = StringUtils::Format("$$%i",i+1);
 159     while ((size_t)(iIndex = strDest.find(temp,iIndex)) != CStdString::npos) // COPIED FROM CStdString WITH THE ADDITION OF $ ESCAPING
 160     {
 161       strDest.replace(strDest.begin()+iIndex,strDest.begin()+iIndex+temp.GetLength(),m_param[i]);
 162       iIndex += m_param[i].length();
 163     }
 164   }
 165   // insert settings
 166   iIndex = 0;
 167   while ((size_t)(iIndex = strDest.find("$INFO[",iIndex)) != CStdString::npos)
 168   {
 169     int iEnd = strDest.Find("]",iIndex);
 170     CStdString strInfo = strDest.Mid(iIndex+6,iEnd-iIndex-6);
 171     CStdString strReplace;
 172     if (m_scraper)
 173       strReplace = m_scraper->GetSetting(strInfo);
 174     strDest.replace(strDest.begin()+iIndex,strDest.begin()+iEnd+1,strReplace);
 175     iIndex += strReplace.length();
 176   }
 177   // insert localize strings
 178   iIndex = 0;
 179   while ((size_t)(iIndex = strDest.find("$LOCALIZE[",iIndex)) != CStdString::npos)
 180   {
 181     int iEnd = strDest.Find("]",iIndex);
 182     CStdString strInfo = strDest.Mid(iIndex+10,iEnd-iIndex-10);
 183     CStdString strReplace;
 184     if (m_scraper)
 185       strReplace = m_scraper->GetString(strtol(strInfo.c_str(),NULL,10));
 186     strDest.replace(strDest.begin()+iIndex,strDest.begin()+iEnd+1,strReplace);
 187     iIndex += strReplace.length();
 188   }
 189   iIndex = 0;
 190   while ((size_t)(iIndex = strDest.find("\\n",iIndex)) != CStdString::npos)
 191     strDest.replace(strDest.begin()+iIndex,strDest.begin()+iIndex+2,"\n");
 192 }
 193
 194 void CScraperParser::ParseExpression(const CStdString& input, CStdString& dest, TiXmlElement* element, bool bAppend)
 195 {
 196   CStdString strOutput = element->Attribute("output");
 197
 198   TiXmlElement* pExpression = element->FirstChildElement("expression");
 199   if (pExpression)
 200   {
 201     bool bInsensitive=true;
 202     const char* sensitive = pExpression->Attribute("cs");
 203     if (sensitive)
 204       if (stricmp(sensitive,"yes") == 0)
 205         bInsensitive=false; // match case sensitive
 206
 207     CRegExp reg(bInsensitive, true);
 208     CStdString strExpression;
 209     if (pExpression->FirstChild())
 210       strExpression = pExpression->FirstChild()->Value();
 211     else
 212       strExpression = "(.*)";
 213     ReplaceBuffers(strExpression);
 214     ReplaceBuffers(strOutput);
 215
 216     if (!reg.RegComp(strExpression.c_str()))
 217     {
 218       return;
 219     }
 220
 221     bool bRepeat = false;
 222     const char* szRepeat = pExpression->Attribute("repeat");
 223     if (szRepeat)
 224       if (stricmp(szRepeat,"yes") == 0)
 225         bRepeat = true;
 226
 227     const char* szClear = pExpression->Attribute("clear");
 228     if (szClear)
 229       if (stricmp(szClear,"yes") == 0)
 230         dest=""; // clear no matter if regexp fails
 231
 232     bool bClean[MAX_SCRAPER_BUFFERS];
 233     GetBufferParams(bClean,pExpression->Attribute("noclean"),true);
 234
 235     bool bTrim[MAX_SCRAPER_BUFFERS];
 236     GetBufferParams(bTrim,pExpression->Attribute("trim"),false);
 237
 238     bool bFixChars[MAX_SCRAPER_BUFFERS];
 239     GetBufferParams(bFixChars,pExpression->Attribute("fixchars"),false);
 240
 241     bool bEncode[MAX_SCRAPER_BUFFERS];
 242     GetBufferParams(bEncode,pExpression->Attribute("encode"),false);
 243
 244     int iOptional = -1;
 245     pExpression->QueryIntAttribute("optional",&iOptional);
 246
 247     int iCompare = -1;
 248     pExpression->QueryIntAttribute("compare",&iCompare);
 249     if (iCompare > -1)
 250       m_param[iCompare-1].ToLower();
 251     CStdString curInput = input;
 252     for (int iBuf=0;iBuf<MAX_SCRAPER_BUFFERS;++iBuf)
 253     {
 254       if (bClean[iBuf])
 255         InsertToken(strOutput,iBuf+1,"!!!CLEAN!!!");
 256       if (bTrim[iBuf])
 257         InsertToken(strOutput,iBuf+1,"!!!TRIM!!!");
 258       if (bFixChars[iBuf])
 259         InsertToken(strOutput,iBuf+1,"!!!FIXCHARS!!!");
 260       if (bEncode[iBuf])
 261         InsertToken(strOutput,iBuf+1,"!!!ENCODE!!!");
 262     }
 263     int i = reg.RegFind(curInput.c_str());
 264     while (i > -1 && (i < (int)curInput.size() || curInput.size() == 0))
 265     {
 266       if (!bAppend)
 267       {
 268         dest = "";
 269         bAppend = true;
 270       }
 271       CStdString strCurOutput=strOutput;
 272
 273       if (iOptional > -1) // check that required param is there
 274       {
 275         char temp[4];
 276         sprintf(temp,"\\%i",iOptional);
 277         std::string szParam = reg.GetReplaceString(temp);
 278         CRegExp reg2;
 279         reg2.RegComp("(.*)(\\\\\\(.*\\\\2.*)\\\\\\)(.*)");
 280         int i2=reg2.RegFind(strCurOutput.c_str());
 281         while (i2 > -1)
 282         {
 283           std::string szRemove(reg2.GetMatch(2));
 284           int iRemove = szRemove.size();
 285           int i3 = strCurOutput.find(szRemove);
 286           if (!szParam.empty())
 287           {
 288             strCurOutput.erase(i3+iRemove,2);
 289             strCurOutput.erase(i3,2);
 290           }
 291           else
 292             strCurOutput.replace(strCurOutput.begin()+i3,strCurOutput.begin()+i3+iRemove+2,"");
 293
 294           i2 = reg2.RegFind(strCurOutput.c_str());
 295         }
 296       }
 297
 298       int iLen = reg.GetFindLen();
 299       // nasty hack #1 - & means \0 in a replace string
 300       strCurOutput.Replace("&","!!!AMPAMP!!!");
 301       std::string result = reg.GetReplaceString(strCurOutput.c_str());
 302       if (!result.empty())
 303       {
 304         CStdString strResult(result);
 305         strResult.Replace("!!!AMPAMP!!!","&");
 306         Clean(strResult);
 307         ReplaceBuffers(strResult);
 308         if (iCompare > -1)
 309         {
 310           CStdString strResultNoCase = strResult;
 311           strResultNoCase.ToLower();
 312           if (strResultNoCase.Find(m_param[iCompare-1]) != -1)
 313             dest += strResult;
 314         }
 315         else
 316           dest += strResult;
 317       }
 318       if (bRepeat && iLen > 0)
 319       {
 320         curInput.erase(0,i+iLen>(int)curInput.size()?curInput.size():i+iLen);
 321         i = reg.RegFind(curInput.c_str());
 322       }
 323       else
 324         i = -1;
 325     }
 326   }
 327 }
 328
 329 void CScraperParser::ParseNext(TiXmlElement* element)
 330 {
 331   TiXmlElement* pReg = element;
 332   while (pReg)
 333   {
 334     TiXmlElement* pChildReg = pReg->FirstChildElement("RegExp");
 335     if (pChildReg)
 336       ParseNext(pChildReg);
 337     else
 338     {
 339       TiXmlElement* pChildReg = pReg->FirstChildElement("clear");
 340       if (pChildReg)
 341         ParseNext(pChildReg);
 342     }
 343
 344     int iDest = 1;
 345     bool bAppend = false;
 346     const char* szDest = pReg->Attribute("dest");
 347     if (szDest)
 348       if (strlen(szDest))
 349       {
 350         if (szDest[strlen(szDest)-1] == '+')
 351           bAppend = true;
 352
 353         iDest = atoi(szDest);
 354       }
 355
 356       const char *szInput = pReg->Attribute("input");
 357       CStdString strInput;
 358       if (szInput)
 359       {
 360         strInput = szInput;
 361         ReplaceBuffers(strInput);
 362       }
 363       else
 364         strInput = m_param[0];
 365
 366       const char* szConditional = pReg->Attribute("conditional");
 367       bool bExecute = true;
 368       if (szConditional)
 369       {
 370         bool bInverse=false;
 371         if (szConditional[0] == '!')
 372         {
 373           bInverse = true;
 374           szConditional++;
 375         }
 376         CStdString strSetting;
 377         if (m_scraper && m_scraper->HasSettings())
 378            strSetting = m_scraper->GetSetting(szConditional);
 379         bExecute = bInverse != strSetting.Equals("true");
 380       }
 381
 382       if (bExecute)
 383       {
 384         if (iDest-1 < MAX_SCRAPER_BUFFERS && iDest-1 > -1)
 385           ParseExpression(strInput, m_param[iDest-1],pReg,bAppend);
 386         else
 387           CLog::Log(LOGERROR,"CScraperParser::ParseNext: destination buffer "
 388                              "out of bounds, skipping expression");
 389       }
 390
 391       pReg = pReg->NextSiblingElement("RegExp");
 392   }
 393 }
 394
 395 const CStdString CScraperParser::Parse(const CStdString& strTag,
 396                                        CScraper* scraper)
 397 {
 398   TiXmlElement* pChildElement = m_pRootElement->FirstChildElement(strTag.c_str());
 399   if(pChildElement == NULL)
 400   {
 401     CLog::Log(LOGERROR,"%s: Could not find scraper function %s",__FUNCTION__,strTag.c_str());
 402     return "";
 403   }
 404   int iResult = 1; // default to param 1
 405   pChildElement->QueryIntAttribute("dest",&iResult);
 406   TiXmlElement* pChildStart = pChildElement->FirstChildElement("RegExp");
 407   m_scraper = scraper;
 408   ParseNext(pChildStart);
 409   CStdString tmp = m_param[iResult-1];
 410
 411   const char* szClearBuffers = pChildElement->Attribute("clearbuffers");
 412   if (!szClearBuffers || stricmp(szClearBuffers,"no") != 0)
 413     ClearBuffers();
 414
 415   return tmp;
 416 }
 417
 418 void CScraperParser::Clean(CStdString& strDirty)
 419 {
 420   int i=0;
 421   CStdString strBuffer;
 422   while ((i=strDirty.Find("!!!CLEAN!!!",i)) != -1)
 423   {
 424     int i2;
 425     if ((i2=strDirty.Find("!!!CLEAN!!!",i+11)) != -1)
 426     {
 427       strBuffer = strDirty.substr(i+11,i2-i-11);
 428       CStdString strConverted(strBuffer);
 429       HTML::CHTMLUtil::RemoveTags(strConverted);
 430       StringUtils::Trim(strConverted);
 431       strDirty.erase(i,i2-i+11);
 432       strDirty.Insert(i,strConverted);
 433       i += strConverted.size();
 434     }
 435     else
 436       break;
 437   }
 438   i=0;
 439   while ((i=strDirty.Find("!!!TRIM!!!",i)) != -1)
 440   {
 441     int i2;
 442     if ((i2=strDirty.Find("!!!TRIM!!!",i+10)) != -1)
 443     {
 444       strBuffer = strDirty.substr(i+10,i2-i-10);
 445       StringUtils::Trim(strBuffer);
 446       strDirty.erase(i,i2-i+10);
 447       strDirty.Insert(i,strBuffer);
 448       i += strBuffer.size();
 449     }
 450     else
 451       break;
 452   }
 453   i=0;
 454   while ((i=strDirty.Find("!!!FIXCHARS!!!",i)) != -1)
 455   {
 456     int i2;
 457     if ((i2=strDirty.Find("!!!FIXCHARS!!!",i+14)) != -1)
 458     {
 459       strBuffer = strDirty.substr(i+14,i2-i-14);
 460       CStdStringW wbuffer;
 461       g_charsetConverter.toW(strBuffer,wbuffer,GetSearchStringEncoding());
 462       CStdStringW wConverted;
 463       HTML::CHTMLUtil::ConvertHTMLToW(wbuffer,wConverted);
 464       g_charsetConverter.fromW(wConverted,strBuffer,GetSearchStringEncoding());
 465       StringUtils::Trim(strBuffer);
 466       ConvertJSON(strBuffer);
 467       strDirty.erase(i,i2-i+14);
 468       strDirty.Insert(i,strBuffer);
 469       i += strBuffer.size();
 470     }
 471     else
 472       break;
 473   }
 474   i=0;
 475   while ((i=strDirty.Find("!!!ENCODE!!!",i)) != -1)
 476   {
 477     int i2;
 478     if ((i2=strDirty.Find("!!!ENCODE!!!",i+12)) != -1)
 479     {
 480       strBuffer = strDirty.substr(i+12,i2-i-12);
 481       CURL::Encode(strBuffer);
 482       strDirty.erase(i,i2-i+12);
 483       strDirty.Insert(i,strBuffer);
 484       i += strBuffer.size();
 485     }
 486     else
 487       break;
 488   }
 489 }
 490
 491 void CScraperParser::ConvertJSON(CStdString &string)
 492 {
 493   CRegExp reg;
 494   reg.RegComp("\\\\u([0-f]{4})");
 495   while (reg.RegFind(string.c_str()) > -1)
 496   {
 497     int pos = reg.GetSubStart(1);
 498     std::string szReplace(reg.GetMatch(1));
 499
 500     CStdString replace = StringUtils::Format("&#x%s;", szReplace.c_str());
 501     string.replace(string.begin()+pos-2, string.begin()+pos+4, replace);
 502   }
 503
 504   CRegExp reg2;
 505   reg2.RegComp("\\\\x([0-9]{2})([^\\\\]+;)");
 506   while (reg2.RegFind(string.c_str()) > -1)
 507   {
 508     int pos1 = reg2.GetSubStart(1);
 509     int pos2 = reg2.GetSubStart(2);
 510     std::string szHexValue(reg2.GetMatch(1));
 511
 512     CStdString replace = StringUtils::Format("%c", strtol(szHexValue.c_str(), NULL, 16));
 513     string.replace(string.begin()+pos1-2, string.begin()+pos2+reg2.GetSubLength(2), replace);
 514   }
 515
 516   string.Replace("\\\"","\"");
 517 }
 518
 519 void CScraperParser::ClearBuffers()
 520 {
 521   //clear all m_param strings
 522   for (int i=0;i<MAX_SCRAPER_BUFFERS;++i)
 523     m_param[i].clear();
 524 }
 525
 526 void CScraperParser::GetBufferParams(bool* result, const char* attribute, bool defvalue)
 527 {
 528   for (int iBuf=0;iBuf<MAX_SCRAPER_BUFFERS;++iBuf)
 529     result[iBuf] = defvalue;;
 530   if (attribute)
 531   {
 532     vector<std::string> vecBufs;
 533     StringUtils::Tokenize(attribute,vecBufs,",");
 534     for (size_t nToken=0; nToken < vecBufs.size(); nToken++)
 535     {
 536       int index = atoi(vecBufs[nToken].c_str())-1;
 537       if (index < MAX_SCRAPER_BUFFERS)
 538         result[index] = !defvalue;
 539     }
 540   }
 541 }
 542
 543 void CScraperParser::InsertToken(CStdString& strOutput, int buf, const char* token)
 544 {
 545   char temp[4];
 546   sprintf(temp,"\\%i",buf);
 547   int i2=0;
 548   while ((i2 = strOutput.Find(temp,i2)) != -1)
 549   {
 550     strOutput.Insert(i2,token);
 551     i2 += strlen(token);
 552     strOutput.Insert(i2+strlen(temp),token);
 553     i2 += strlen(temp);
 554   }
 555 }
 556
 557 void CScraperParser::AddDocument(const CXBMCTinyXML* doc)
 558 {
 559   const TiXmlNode* node = doc->RootElement()->FirstChild();
 560   while (node)
 561   {
 562     m_pRootElement->InsertEndChild(*node);
 563     node = node->NextSibling();
 564   }
 565 }
 566