code.vuplus.com Git - vuplus_xbmc/blob - xbmc/addons/Scraper.cpp

   1 /*
   2 *      Copyright (C) 2005-2013 Team XBMC
   3 *      http://xbmc.org
   4 *
   5 *  This Program is free software; you can redistribute it and/or modify
   6 *  it under the terms of the GNU General Public License as published by
   7 *  the Free Software Foundation; either version 2, or (at your option)
   8 *  any later version.
   9 *
  10 *  This Program is distributed in the hope that it will be useful,
  11 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13 *  GNU General Public License for more details.
  14 *
  15 *  You should have received a copy of the GNU General Public License
  16 *  along with XBMC; see the file COPYING.  If not, see
  17 *  <http://www.gnu.org/licenses/>.
  18 *
  19 */
  20 #include "Scraper.h"
  21 #include "filesystem/File.h"
  22 #include "filesystem/Directory.h"
  23 #include "filesystem/CurlFile.h"
  24 #include "AddonManager.h"
  25 #include "utils/ScraperParser.h"
  26 #include "utils/ScraperUrl.h"
  27 #include "utils/CharsetConverter.h"
  28 #include "utils/log.h"
  29 #include "music/infoscanner/MusicAlbumInfo.h"
  30 #include "music/infoscanner/MusicArtistInfo.h"
  31 #include "utils/fstrcmp.h"
  32 #include "settings/AdvancedSettings.h"
  33 #include "FileItem.h"
  34 #include "utils/URIUtils.h"
  35 #include "utils/XMLUtils.h"
  36 #include "utils/StringUtils.h"
  37 #include "music/MusicDatabase.h"
  38 #include "video/VideoDatabase.h"
  39 #include "music/Album.h"
  40 #include "music/Artist.h"
  41 #include "Util.h"
  42 #include "URL.h"
  43
  44 #include <sstream>
  45
  46 using namespace std;
  47 using namespace XFILE;
  48 using namespace MUSIC_GRABBER;
  49 using namespace VIDEO;
  50
  51 namespace ADDON
  52 {
  53
  54 typedef struct
  55 {
  56   const char*  name;
  57   CONTENT_TYPE type;
  58   int          pretty;
  59 } ContentMapping;
  60
  61 static const ContentMapping content[] =
  62   {{"unknown",       CONTENT_NONE,          231 },
  63    {"albums",        CONTENT_ALBUMS,        132 },
  64    {"music",         CONTENT_ALBUMS,        132 },
  65    {"artists",       CONTENT_ARTISTS,       133 },
  66    {"movies",        CONTENT_MOVIES,      20342 },
  67    {"tvshows",       CONTENT_TVSHOWS,     20343 },
  68    {"musicvideos",   CONTENT_MUSICVIDEOS, 20389 }};
  69
  70 CStdString TranslateContent(const CONTENT_TYPE &type, bool pretty/*=false*/)
  71 {
  72   for (unsigned int index=0; index < sizeof(content)/sizeof(content[0]); ++index)
  73   {
  74     const ContentMapping &map = content[index];
  75     if (type == map.type)
  76     {
  77       if (pretty && map.pretty)
  78         return g_localizeStrings.Get(map.pretty);
  79       else
  80         return map.name;
  81     }
  82   }
  83   return "";
  84 }
  85
  86 CONTENT_TYPE TranslateContent(const CStdString &string)
  87 {
  88   for (unsigned int index=0; index < sizeof(content)/sizeof(content[0]); ++index)
  89   {
  90     const ContentMapping &map = content[index];
  91     if (string.Equals(map.name))
  92       return map.type;
  93   }
  94   return CONTENT_NONE;
  95 }
  96
  97 TYPE ScraperTypeFromContent(const CONTENT_TYPE &content)
  98 {
  99   switch (content)
 100   {
 101   case CONTENT_ALBUMS:
 102     return ADDON_SCRAPER_ALBUMS;
 103   case CONTENT_ARTISTS:
 104     return ADDON_SCRAPER_ARTISTS;
 105   case CONTENT_MOVIES:
 106     return ADDON_SCRAPER_MOVIES;
 107   case CONTENT_MUSICVIDEOS:
 108     return ADDON_SCRAPER_MUSICVIDEOS;
 109   case CONTENT_TVSHOWS:
 110     return ADDON_SCRAPER_TVSHOWS;
 111   default:
 112     return ADDON_UNKNOWN;
 113   }
 114 }
 115
 116 // if the XML root is <error>, throw CScraperError with enclosed <title>/<message> values
 117 static void CheckScraperError(const TiXmlElement *pxeRoot)
 118 {
 119   if (!pxeRoot || stricmp(pxeRoot->Value(), "error"))
 120     return;
 121   CStdString sTitle;
 122   CStdString sMessage;
 123   XMLUtils::GetString(pxeRoot, "title", sTitle);
 124   XMLUtils::GetString(pxeRoot, "message", sMessage);
 125   throw CScraperError(sTitle, sMessage);
 126 }
 127
 128 CScraper::CScraper(const cp_extension_t *ext) : CAddon(ext), m_fLoaded(false)
 129 {
 130   if (ext)
 131   {
 132     m_language = CAddonMgr::Get().GetExtValue(ext->configuration, "@language");
 133     m_requiressettings = CAddonMgr::Get().GetExtValue(ext->configuration,"@requiressettings").Equals("true");
 134     CStdString persistence = CAddonMgr::Get().GetExtValue(ext->configuration, "@cachepersistence");
 135     if (!persistence.empty())
 136       m_persistence.SetFromTimeString(persistence);
 137   }
 138   switch (Type())
 139   {
 140     case ADDON_SCRAPER_ALBUMS:
 141       m_pathContent = CONTENT_ALBUMS;
 142       break;
 143     case ADDON_SCRAPER_ARTISTS:
 144       m_pathContent = CONTENT_ARTISTS;
 145       break;
 146     case ADDON_SCRAPER_MOVIES:
 147       m_pathContent = CONTENT_MOVIES;
 148       break;
 149     case ADDON_SCRAPER_MUSICVIDEOS:
 150       m_pathContent = CONTENT_MUSICVIDEOS;
 151       break;
 152     case ADDON_SCRAPER_TVSHOWS:
 153       m_pathContent = CONTENT_TVSHOWS;
 154       break;
 155     default:
 156       m_pathContent = CONTENT_NONE;
 157       break;
 158   }
 159 }
 160
 161 AddonPtr CScraper::Clone() const
 162 {
 163   return AddonPtr(new CScraper(*this));
 164 }
 165
 166 CScraper::CScraper(const CScraper &rhs)
 167   : CAddon(rhs), m_fLoaded(false)
 168 {
 169   m_pathContent = rhs.m_pathContent;
 170   m_persistence = rhs.m_persistence;
 171   m_requiressettings = rhs.m_requiressettings;
 172   m_language = rhs.m_language;
 173 }
 174
 175 bool CScraper::Supports(const CONTENT_TYPE &content) const
 176 {
 177   return Type() == ScraperTypeFromContent(content);
 178 }
 179
 180 bool CScraper::SetPathSettings(CONTENT_TYPE content, const CStdString& xml)
 181 {
 182   m_pathContent = content;
 183   if (!LoadSettings())
 184     return false;
 185
 186   if (xml.empty())
 187     return true;
 188
 189   CXBMCTinyXML doc;
 190   doc.Parse(xml);
 191   m_userSettingsLoaded = SettingsFromXML(doc);
 192
 193   return m_userSettingsLoaded;
 194 }
 195
 196 CStdString CScraper::GetPathSettings()
 197 {
 198   if (!LoadSettings())
 199     return "";
 200
 201   stringstream stream;
 202   CXBMCTinyXML doc;
 203   SettingsToXML(doc);
 204   if (doc.RootElement())
 205     stream << *doc.RootElement();
 206
 207   return stream.str();
 208 }
 209
 210 void CScraper::ClearCache()
 211 {
 212   CStdString strCachePath = URIUtils::AddFileToFolder(g_advancedSettings.m_cachePath, "scrapers");
 213
 214   // create scraper cache dir if needed
 215   if (!CDirectory::Exists(strCachePath))
 216     CDirectory::Create(strCachePath);
 217
 218   strCachePath = URIUtils::AddFileToFolder(strCachePath, ID());
 219   URIUtils::AddSlashAtEnd(strCachePath);
 220
 221   if (CDirectory::Exists(strCachePath))
 222   {
 223     CFileItemList items;
 224     CDirectory::GetDirectory(strCachePath,items);
 225     for (int i=0;i<items.Size();++i)
 226     {
 227       // wipe cache
 228       if (items[i]->m_dateTime + m_persistence <= CDateTime::GetCurrentDateTime())
 229         CFile::Delete(items[i]->GetPath());
 230     }
 231   }
 232   else
 233     CDirectory::Create(strCachePath);
 234 }
 235
 236 // returns a vector of strings: the first is the XML output by the function; the rest
 237 // is XML output by chained functions, possibly recursively
 238 // the CCurlFile object is passed in so that URL fetches can be canceled from other threads
 239 // throws CScraperError abort on internal failures (e.g., parse errors)
 240 vector<CStdString> CScraper::Run(const CStdString& function,
 241                                  const CScraperUrl& scrURL,
 242                                  CCurlFile& http,
 243                                  const vector<CStdString>* extras)
 244 {
 245   if (!Load())
 246     throw CScraperError();
 247
 248   CStdString strXML = InternalRun(function,scrURL,http,extras);
 249   if (strXML.empty())
 250   {
 251     if (function != "NfoUrl" && function != "ResolveIDToUrl")
 252       CLog::Log(LOGERROR, "%s: Unable to parse web site",__FUNCTION__);
 253     throw CScraperError();
 254   }
 255
 256   CLog::Log(LOGDEBUG,"scraper: %s returned %s",function.c_str(),strXML.c_str());
 257
 258   CXBMCTinyXML doc;
 259   doc.Parse(strXML, TIXML_ENCODING_UNKNOWN);
 260   if (!doc.RootElement())
 261   {
 262     CLog::Log(LOGERROR, "%s: Unable to parse XML",__FUNCTION__);
 263     throw CScraperError();
 264   }
 265
 266   vector<CStdString> result;
 267   result.push_back(strXML);
 268   TiXmlElement* xchain = doc.RootElement()->FirstChildElement();
 269   // skip children of the root element until <url> or <chain>
 270   while (xchain && strcmp(xchain->Value(),"url") && strcmp(xchain->Value(),"chain"))
 271       xchain = xchain->NextSiblingElement();
 272   while (xchain)
 273   {
 274     // <chain|url function="...">param</>
 275     const char* szFunction = xchain->Attribute("function");
 276     if (szFunction)
 277     {
 278       CScraperUrl scrURL2;
 279       vector<CStdString> extras;
 280       // for <chain>, pass the contained text as a parameter; for <url>, as URL content
 281       if (strcmp(xchain->Value(),"chain")==0)
 282       {
 283         if (xchain->FirstChild())
 284           extras.push_back(xchain->FirstChild()->Value());
 285       }
 286       else
 287         scrURL2.ParseElement(xchain);
 288       // Fix for empty chains. $$1 would still contain the
 289       // previous value as there is no child of the xml node.
 290       // since $$1 will always either contain the data from an
 291       // url or the parameters to a chain, we can safely clear it here
 292       // to fix this issue
 293       m_parser.m_param[0].clear();
 294       vector<CStdString> result2 = RunNoThrow(szFunction,scrURL2,http,&extras);
 295       result.insert(result.end(),result2.begin(),result2.end());
 296     }
 297     xchain = xchain->NextSiblingElement();
 298     // continue to skip past non-<url> or <chain> elements
 299     while (xchain && strcmp(xchain->Value(),"url") && strcmp(xchain->Value(),"chain"))
 300       xchain = xchain->NextSiblingElement();
 301   }
 302
 303   return result;
 304 }
 305
 306 // just like Run, but returns an empty list instead of throwing in case of error
 307 // don't use in new code; errors should be handled appropriately
 308 std::vector<CStdString> CScraper::RunNoThrow(const CStdString& function,
 309   const CScraperUrl& url,
 310   XFILE::CCurlFile& http,
 311   const std::vector<CStdString>* extras)
 312 {
 313   std::vector<CStdString> vcs;
 314   try
 315   {
 316     vcs = Run(function, url, http, extras);
 317   }
 318   catch (const CScraperError &sce)
 319   {
 320     ASSERT(sce.FAborted());  // the only kind we should get
 321   }
 322   return vcs;
 323 }
 324
 325 CStdString CScraper::InternalRun(const CStdString& function,
 326                                  const CScraperUrl& scrURL,
 327                                  CCurlFile& http,
 328                                  const vector<CStdString>* extras)
 329 {
 330   // walk the list of input URLs and fetch each into parser parameters
 331   unsigned int i;
 332   for (i=0;i<scrURL.m_url.size();++i)
 333   {
 334     CStdString strCurrHTML;
 335     if (!CScraperUrl::Get(scrURL.m_url[i],m_parser.m_param[i],http,ID()) || m_parser.m_param[i].size() == 0)
 336       return "";
 337   }
 338   // put the 'extra' parameterts into the parser parameter list too
 339   if (extras)
 340   {
 341     for (unsigned int j=0;j<extras->size();++j)
 342       m_parser.m_param[j+i] = (*extras)[j];
 343   }
 344
 345   return m_parser.Parse(function,this);
 346 }
 347
 348 bool CScraper::Load()
 349 {
 350   if (m_fLoaded)
 351     return true;
 352
 353   bool result=m_parser.Load(LibPath());
 354   if (result)
 355   {
 356     // TODO: this routine assumes that deps are a single level, and assumes the dep is installed.
 357     //       1. Does it make sense to have recursive dependencies?
 358     //       2. Should we be checking the dep versions or do we assume it is ok?
 359     ADDONDEPS deps = GetDeps();
 360     ADDONDEPS::iterator itr = deps.begin();
 361     while (itr != deps.end())
 362     {
 363       if (itr->first.Equals("xbmc.metadata"))
 364       {
 365         ++itr;
 366         continue;
 367       }
 368       AddonPtr dep;
 369
 370       bool bOptional = itr->second.second;
 371
 372       if (CAddonMgr::Get().GetAddon((*itr).first, dep))
 373       {
 374         CXBMCTinyXML doc;
 375         if (dep->Type() == ADDON_SCRAPER_LIBRARY && doc.LoadFile(dep->LibPath()))
 376           m_parser.AddDocument(&doc);
 377       }
 378       else
 379       {
 380         if (!bOptional)
 381         {
 382           result = false;
 383           break;
 384         }
 385       }
 386       itr++;
 387     }
 388   }
 389
 390   if (!result)
 391     CLog::Log(LOGWARNING, "failed to load scraper XML");
 392   return m_fLoaded = result;
 393 }
 394
 395 bool CScraper::IsInUse() const
 396 {
 397   if (Supports(CONTENT_ALBUMS) || Supports(CONTENT_ARTISTS))
 398   { // music scraper
 399     CMusicDatabase db;
 400     if (db.Open() && db.ScraperInUse(ID()))
 401       return true;
 402   }
 403   else
 404   { // video scraper
 405     CVideoDatabase db;
 406     if (db.Open() && db.ScraperInUse(ID()))
 407       return true;
 408   }
 409   return false;
 410 }
 411
 412 bool CScraper::IsNoop()
 413 {
 414     if (!Load())
 415       throw CScraperError();
 416
 417     return m_parser.IsNoop();
 418 }
 419
 420 // pass in contents of .nfo file; returns URL (possibly empty if none found)
 421 // and may populate strId, or throws CScraperError on error
 422 CScraperUrl CScraper::NfoUrl(const CStdString &sNfoContent)
 423 {
 424   CScraperUrl scurlRet;
 425
 426   if (IsNoop())
 427     return scurlRet;
 428
 429   // scraper function takes contents of .nfo file, returns XML (see below)
 430   vector<CStdString> vcsIn;
 431   vcsIn.push_back(sNfoContent);
 432   CScraperUrl scurl;
 433   CCurlFile fcurl;
 434   vector<CStdString> vcsOut = Run("NfoUrl", scurl, fcurl, &vcsIn);
 435   if (vcsOut.empty() || vcsOut[0].empty())
 436     return scurlRet;
 437   if (vcsOut.size() > 1)
 438     CLog::Log(LOGWARNING, "%s: scraper returned multiple results; using first", __FUNCTION__);
 439
 440   // parse returned XML: either <error> element on error, blank on failure,
 441   // or <url>...</url> or <url>...</url><id>...</id> on success
 442   for (unsigned int i=0; i < vcsOut.size(); ++i)
 443   {
 444     CXBMCTinyXML doc;
 445     doc.Parse(vcsOut[i], TIXML_ENCODING_UTF8);
 446     CheckScraperError(doc.RootElement());
 447
 448     if (doc.RootElement())
 449     {
 450       /*
 451        NOTE: Scrapers might return invalid xml with some loose
 452        elements (eg. '<url>http://some.url</url><id>123</id>').
 453        Since XMLUtils::GetString() is assuming well formed xml
 454        with start and end-tags we're not able to use it.
 455        Check for the desired Elements instead.
 456       */
 457       TiXmlElement* pxeUrl=NULL;
 458       TiXmlElement* pId=NULL;
 459       if (!strcmp(doc.RootElement()->Value(),"details"))
 460       {
 461         pxeUrl = doc.RootElement()->FirstChildElement("url");
 462         pId = doc.RootElement()->FirstChildElement("id");
 463       }
 464       else
 465       {
 466         pId = doc.FirstChildElement("id");
 467         pxeUrl = doc.FirstChildElement("url");
 468       }
 469       if (pId && pId->FirstChild())
 470         scurlRet.strId = pId->FirstChild()->Value();
 471
 472       if (pxeUrl && pxeUrl->Attribute("function"))
 473         continue;
 474
 475       if (pxeUrl)
 476         scurlRet.ParseElement(pxeUrl);
 477       else if (!strcmp(doc.RootElement()->Value(), "url"))
 478         scurlRet.ParseElement(doc.RootElement());
 479       else
 480         continue;
 481       break;
 482     }
 483   }
 484   return scurlRet;
 485 }
 486
 487 CScraperUrl CScraper::ResolveIDToUrl(const CStdString& externalID)
 488 {
 489   CScraperUrl scurlRet;
 490
 491   // scraper function takes an external ID, returns XML (see below)
 492   vector<CStdString> vcsIn;
 493   vcsIn.push_back(externalID);
 494   CScraperUrl scurl;
 495   CCurlFile fcurl;
 496   vector<CStdString> vcsOut = Run("ResolveIDToUrl", scurl, fcurl, &vcsIn);
 497   if (vcsOut.empty() || vcsOut[0].empty())
 498     return scurlRet;
 499   if (vcsOut.size() > 1)
 500     CLog::Log(LOGWARNING, "%s: scraper returned multiple results; using first", __FUNCTION__);
 501
 502   // parse returned XML: either <error> element on error, blank on failure,
 503   // or <url>...</url> or <url>...</url><id>...</id> on success
 504   for (unsigned int i=0; i < vcsOut.size(); ++i)
 505   {
 506     CXBMCTinyXML doc;
 507     doc.Parse(vcsOut[i], TIXML_ENCODING_UTF8);
 508     CheckScraperError(doc.RootElement());
 509
 510     if (doc.RootElement())
 511     {
 512       /*
 513        NOTE: Scrapers might return invalid xml with some loose
 514        elements (eg. '<url>http://some.url</url><id>123</id>').
 515        Since XMLUtils::GetString() is assuming well formed xml
 516        with start and end-tags we're not able to use it.
 517        Check for the desired Elements instead.
 518        */
 519       TiXmlElement* pxeUrl=NULL;
 520       TiXmlElement* pId=NULL;
 521       if (!strcmp(doc.RootElement()->Value(),"details"))
 522       {
 523         pxeUrl = doc.RootElement()->FirstChildElement("url");
 524         pId = doc.RootElement()->FirstChildElement("id");
 525       }
 526       else
 527       {
 528         pId = doc.FirstChildElement("id");
 529         pxeUrl = doc.FirstChildElement("url");
 530       }
 531       if (pId && pId->FirstChild())
 532         scurlRet.strId = pId->FirstChild()->Value();
 533
 534       if (pxeUrl && pxeUrl->Attribute("function"))
 535         continue;
 536
 537       if (pxeUrl)
 538         scurlRet.ParseElement(pxeUrl);
 539       else if (!strcmp(doc.RootElement()->Value(), "url"))
 540         scurlRet.ParseElement(doc.RootElement());
 541       else
 542         continue;
 543       break;
 544     }
 545   }
 546   return scurlRet;
 547 }
 548
 549 static bool RelevanceSortFunction(const CScraperUrl &left, const CScraperUrl &right)
 550 {
 551   return left.relevance > right.relevance;
 552 }
 553
 554 // fetch list of matching movies sorted by relevance (may be empty);
 555 // throws CScraperError on error; first called with fFirst set, then unset if first try fails
 556 std::vector<CScraperUrl> CScraper::FindMovie(XFILE::CCurlFile &fcurl, const CStdString &sMovie,
 557   bool fFirst)
 558 {
 559   // prepare parameters for URL creation
 560   CStdString sTitle, sTitleYear, sYear;
 561   CUtil::CleanString(sMovie, sTitle, sTitleYear, sYear, true/*fRemoveExt*/, fFirst);
 562
 563   CLog::Log(LOGDEBUG, "%s: Searching for '%s' using %s scraper "
 564     "(path: '%s', content: '%s', version: '%s')", __FUNCTION__, sTitle.c_str(),
 565     Name().c_str(), Path().c_str(),
 566     ADDON::TranslateContent(Content()).c_str(), Version().c_str());
 567
 568   std::vector<CScraperUrl> vcscurl;
 569   if (IsNoop())
 570     return vcscurl;
 571
 572   if (!fFirst)
 573     StringUtils::Replace(sTitle, '-',' ');
 574
 575   StringUtils::ToLower(sTitle);
 576
 577   vector<CStdString> vcsIn(1);
 578   g_charsetConverter.utf8To(SearchStringEncoding(), sTitle, vcsIn[0]);
 579   CURL::Encode(vcsIn[0]);
 580   if (!sYear.empty())
 581     vcsIn.push_back(sYear);
 582
 583   // request a search URL from the title/filename/etc.
 584   CScraperUrl scurl;
 585   vector<CStdString> vcsOut = Run("CreateSearchUrl", scurl, fcurl, &vcsIn);
 586   if (vcsOut.empty())
 587   {
 588     CLog::Log(LOGDEBUG, "%s: CreateSearchUrl failed", __FUNCTION__);
 589     throw CScraperError();
 590   }
 591   scurl.ParseString(vcsOut[0]);
 592
 593   // do the search, and parse the result into a list
 594   vcsIn.clear();
 595   vcsIn.push_back(scurl.m_url[0].m_url);
 596   vcsOut = Run("GetSearchResults", scurl, fcurl, &vcsIn);
 597
 598   bool fSort(true);
 599   std::set<CStdString> stsDupeCheck;
 600   bool fResults(false);
 601   for (CStdStringArray::const_iterator i = vcsOut.begin(); i != vcsOut.end(); ++i)
 602   {
 603     CXBMCTinyXML doc;
 604     doc.Parse(*i, TIXML_ENCODING_UTF8);
 605     if (!doc.RootElement())
 606     {
 607       CLog::Log(LOGERROR, "%s: Unable to parse XML", __FUNCTION__);
 608       continue;  // might have more valid results later
 609     }
 610
 611     CheckScraperError(doc.RootElement());
 612
 613     TiXmlHandle xhDoc(&doc);
 614     TiXmlHandle xhResults = xhDoc.FirstChild("results");
 615     if (!xhResults.Element())
 616       continue;
 617     fResults = true;  // even if empty
 618
 619     // we need to sort if returned results don't specify 'sorted="yes"'
 620     if (fSort)
 621     {
 622       const char *sorted = xhResults.Element()->Attribute("sorted");
 623       if (sorted != NULL)
 624         fSort = !StringUtils::EqualsNoCase(sorted, "yes");
 625     }
 626
 627     for (TiXmlElement *pxeMovie = xhResults.FirstChild("entity").Element();
 628       pxeMovie; pxeMovie = pxeMovie->NextSiblingElement())
 629     {
 630       CScraperUrl scurlMovie;
 631       TiXmlNode *pxnTitle = pxeMovie->FirstChild("title");
 632       TiXmlElement *pxeLink = pxeMovie->FirstChildElement("url");
 633       if (pxnTitle && pxnTitle->FirstChild() && pxeLink && pxeLink->FirstChild())
 634       {
 635         scurlMovie.strTitle = pxnTitle->FirstChild()->Value();
 636         XMLUtils::GetString(pxeMovie, "id", scurlMovie.strId);
 637
 638         for ( ; pxeLink && pxeLink->FirstChild(); pxeLink = pxeLink->NextSiblingElement("url"))
 639           scurlMovie.ParseElement(pxeLink);
 640
 641         // calculate the relavance of this hit
 642         CStdString sCompareTitle = scurlMovie.strTitle;
 643         StringUtils::ToLower(sCompareTitle);
 644         CStdString sMatchTitle = sTitle;
 645         StringUtils::ToLower(sMatchTitle);
 646
 647         /*
 648          * Identify the best match by performing a fuzzy string compare on the search term and
 649          * the result. Additionally, use the year (if available) to further refine the best match.
 650          * An exact match scores 1, a match off by a year scores 0.5 (release dates can vary between
 651          * countries), otherwise it scores 0.
 652          */
 653         CStdString sCompareYear;
 654         XMLUtils::GetString(pxeMovie, "year", sCompareYear);
 655
 656         double yearScore = 0;
 657         if (!sYear.empty() && !sCompareYear.empty())
 658           yearScore = std::max(0.0, 1-0.5*abs(atoi(sYear)-atoi(sCompareYear)));
 659
 660         scurlMovie.relevance = fstrcmp(sMatchTitle.c_str(), sCompareTitle.c_str(), 0.0) + yearScore;
 661
 662         // reconstruct a title for the user
 663         if (!sCompareYear.empty())
 664           scurlMovie.strTitle += StringUtils::Format(" (%s)", sCompareYear.c_str());
 665
 666         CStdString sLanguage;
 667         if (XMLUtils::GetString(pxeMovie, "language", sLanguage) && !sLanguage.empty())
 668           scurlMovie.strTitle += StringUtils::Format(" (%s)", sLanguage.c_str());
 669
 670         // filter for dupes from naughty scrapers
 671         if (stsDupeCheck.insert(scurlMovie.m_url[0].m_url + " " + scurlMovie.strTitle).second)
 672           vcscurl.push_back(scurlMovie);
 673       }
 674     }
 675   }
 676
 677   if (!fResults)
 678     throw CScraperError();  // scraper aborted
 679
 680   if (fSort)
 681     std::stable_sort(vcscurl.begin(), vcscurl.end(), RelevanceSortFunction);
 682
 683   return vcscurl;
 684 }
 685
 686 // find album by artist, using fcurl for web fetches
 687 // returns a list of albums (empty if no match or failure)
 688 std::vector<CMusicAlbumInfo> CScraper::FindAlbum(CCurlFile &fcurl, const CStdString &sAlbum,
 689   const CStdString &sArtist)
 690 {
 691   CLog::Log(LOGDEBUG, "%s: Searching for '%s - %s' using %s scraper "
 692     "(path: '%s', content: '%s', version: '%s')", __FUNCTION__, sArtist.c_str(),
 693     sAlbum.c_str(), Name().c_str(), Path().c_str(),
 694     ADDON::TranslateContent(Content()).c_str(), Version().c_str());
 695
 696   std::vector<CMusicAlbumInfo> vcali;
 697   if (IsNoop())
 698     return vcali;
 699
 700   // scraper function is given the album and artist as parameters and
 701   // returns an XML <url> element parseable by CScraperUrl
 702   std::vector<CStdString> extras(2);
 703   g_charsetConverter.utf8To(SearchStringEncoding(), sAlbum, extras[0]);
 704   g_charsetConverter.utf8To(SearchStringEncoding(), sArtist, extras[1]);
 705   CURL::Encode(extras[0]);
 706   CURL::Encode(extras[1]);
 707   CScraperUrl scurl;
 708   vector<CStdString> vcsOut = RunNoThrow("CreateAlbumSearchUrl", scurl, fcurl, &extras);
 709   if (vcsOut.size() > 1)
 710     CLog::Log(LOGWARNING, "%s: scraper returned multiple results; using first", __FUNCTION__);
 711
 712   if (vcsOut.empty() || vcsOut[0].empty())
 713     return vcali;
 714   scurl.ParseString(vcsOut[0]);
 715
 716   // the next function is passed the contents of the returned URL, and returns
 717   // an empty string on failure; on success, returns XML matches in the form:
 718   // <results>
 719   //  <entity>
 720   //   <title>...</title>
 721   //   <url>...</url> (with the usual CScraperUrl decorations like post or spoof)
 722   //   <artist>...</artist>
 723   //   <year>...</year>
 724   //   <relevance [scale="..."]>...</relevance> (scale defaults to 1; score is divided by it)
 725   //  </entity>
 726   //  ...
 727   // </results>
 728   vcsOut = RunNoThrow("GetAlbumSearchResults", scurl, fcurl);
 729
 730   // parse the returned XML into a vector of album objects
 731   for (CStdStringArray::const_iterator i = vcsOut.begin(); i != vcsOut.end(); ++i)
 732   {
 733     CXBMCTinyXML doc;
 734     doc.Parse(*i, TIXML_ENCODING_UTF8);
 735     TiXmlHandle xhDoc(&doc);
 736
 737     for (TiXmlElement* pxeAlbum = xhDoc.FirstChild("results").FirstChild("entity").Element();
 738       pxeAlbum; pxeAlbum = pxeAlbum->NextSiblingElement())
 739     {
 740       CStdString sTitle;
 741       if (XMLUtils::GetString(pxeAlbum, "title", sTitle) && !sTitle.empty())
 742       {
 743         CStdString sArtist;
 744         CStdString sAlbumName;
 745         if (XMLUtils::GetString(pxeAlbum, "artist", sArtist) && !sArtist.empty())
 746           sAlbumName = StringUtils::Format("%s - %s", sArtist.c_str(), sTitle.c_str());
 747         else
 748           sAlbumName = sTitle;
 749
 750         CStdString sYear;
 751         if (XMLUtils::GetString(pxeAlbum, "year", sYear) && !sYear.empty())
 752           sAlbumName = StringUtils::Format("%s (%s)", sAlbumName.c_str(), sYear.c_str());
 753
 754         // if no URL is provided, use the URL we got back from CreateAlbumSearchUrl
 755         // (e.g., in case we only got one result back and were sent to the detail page)
 756         TiXmlElement* pxeLink = pxeAlbum->FirstChildElement("url");
 757         CScraperUrl scurlAlbum;
 758         if (!pxeLink)
 759           scurlAlbum.ParseString(scurl.m_xml);
 760         for ( ; pxeLink && pxeLink->FirstChild(); pxeLink = pxeLink->NextSiblingElement("url"))
 761           scurlAlbum.ParseElement(pxeLink);
 762
 763         if (!scurlAlbum.m_url.size())
 764           continue;
 765
 766         CMusicAlbumInfo ali(sTitle, sArtist, sAlbumName, scurlAlbum);
 767
 768         TiXmlElement* pxeRel = pxeAlbum->FirstChildElement("relevance");
 769         if (pxeRel && pxeRel->FirstChild())
 770         {
 771           const char* szScale = pxeRel->Attribute("scale");
 772           float flScale = szScale ? float(atof(szScale)) : 1;
 773           ali.SetRelevance(float(atof(pxeRel->FirstChild()->Value())) / flScale);
 774         }
 775
 776         vcali.push_back(ali);
 777       }
 778     }
 779   }
 780   return vcali;
 781 }
 782
 783 // find artist, using fcurl for web fetches
 784 // returns a list of artists (empty if no match or failure)
 785 std::vector<CMusicArtistInfo> CScraper::FindArtist(CCurlFile &fcurl,
 786   const CStdString &sArtist)
 787 {
 788   CLog::Log(LOGDEBUG, "%s: Searching for '%s' using %s scraper "
 789     "(file: '%s', content: '%s', version: '%s')", __FUNCTION__, sArtist.c_str(),
 790     Name().c_str(), Path().c_str(),
 791     ADDON::TranslateContent(Content()).c_str(), Version().c_str());
 792
 793   std::vector<CMusicArtistInfo> vcari;
 794   if (IsNoop())
 795     return vcari;
 796
 797   // scraper function is given the artist as parameter and
 798   // returns an XML <url> element parseable by CScraperUrl
 799   std::vector<CStdString> extras(1);
 800   g_charsetConverter.utf8To(SearchStringEncoding(), sArtist, extras[0]);
 801   CURL::Encode(extras[0]);
 802   CScraperUrl scurl;
 803   vector<CStdString> vcsOut = RunNoThrow("CreateArtistSearchUrl", scurl, fcurl, &extras);
 804
 805   if (vcsOut.empty() || vcsOut[0].empty())
 806     return vcari;
 807   scurl.ParseString(vcsOut[0]);
 808
 809   // the next function is passed the contents of the returned URL, and returns
 810   // an empty string on failure; on success, returns XML matches in the form:
 811   // <results>
 812   //  <entity>
 813   //   <title>...</title>
 814   //   <year>...</year>
 815   //   <genre>...</genre>
 816   //   <url>...</url> (with the usual CScraperUrl decorations like post or spoof)
 817   //  </entity>
 818   //  ...
 819   // </results>
 820   vcsOut = RunNoThrow("GetArtistSearchResults", scurl, fcurl);
 821
 822   // parse the returned XML into a vector of artist objects
 823   for (CStdStringArray::const_iterator i = vcsOut.begin(); i != vcsOut.end(); ++i)
 824   {
 825     CXBMCTinyXML doc;
 826     doc.Parse(*i, TIXML_ENCODING_UTF8);
 827     if (!doc.RootElement())
 828     {
 829       CLog::Log(LOGERROR, "%s: Unable to parse XML", __FUNCTION__);
 830       return vcari;
 831     }
 832     TiXmlHandle xhDoc(&doc);
 833     for (TiXmlElement* pxeArtist = xhDoc.FirstChild("results").FirstChild("entity").Element();
 834       pxeArtist; pxeArtist = pxeArtist->NextSiblingElement())
 835     {
 836       TiXmlNode* pxnTitle = pxeArtist->FirstChild("title");
 837       if (pxnTitle && pxnTitle->FirstChild())
 838       {
 839         CScraperUrl scurlArtist;
 840
 841         TiXmlElement* pxeLink = pxeArtist->FirstChildElement("url");
 842         if (!pxeLink)
 843           scurlArtist.ParseString(scurl.m_xml);
 844         for ( ; pxeLink && pxeLink->FirstChild(); pxeLink = pxeLink->NextSiblingElement("url"))
 845           scurlArtist.ParseElement(pxeLink);
 846
 847         if (!scurlArtist.m_url.size())
 848           continue;
 849
 850         CMusicArtistInfo ari(pxnTitle->FirstChild()->Value(), scurlArtist);
 851         CStdString genre;
 852         XMLUtils::GetString(pxeArtist, "genre", genre);
 853         if (!genre.empty())
 854           ari.GetArtist().genre = StringUtils::Split(genre, g_advancedSettings.m_musicItemSeparator);
 855         XMLUtils::GetString(pxeArtist, "year", ari.GetArtist().strBorn);
 856
 857         vcari.push_back(ari);
 858       }
 859     }
 860   }
 861   return vcari;
 862 }
 863
 864 // fetch list of episodes from URL (from video database)
 865 EPISODELIST CScraper::GetEpisodeList(XFILE::CCurlFile &fcurl, const CScraperUrl &scurl)
 866 {
 867   EPISODELIST vcep;
 868   if (scurl.m_url.empty())
 869     return vcep;
 870
 871   CLog::Log(LOGDEBUG, "%s: Searching '%s' using %s scraper "
 872     "(file: '%s', content: '%s', version: '%s')", __FUNCTION__,
 873     scurl.m_url[0].m_url.c_str(), Name().c_str(), Path().c_str(),
 874     ADDON::TranslateContent(Content()).c_str(), Version().c_str());
 875
 876   vector<CStdString> vcsIn;
 877   vcsIn.push_back(scurl.m_url[0].m_url);
 878   vector<CStdString> vcsOut = RunNoThrow("GetEpisodeList", scurl, fcurl, &vcsIn);
 879
 880   // parse the XML response
 881   for (CStdStringArray::const_iterator i = vcsOut.begin(); i != vcsOut.end(); ++i)
 882   {
 883     CXBMCTinyXML doc;
 884     doc.Parse(*i);
 885     if (!doc.RootElement())
 886     {
 887       CLog::Log(LOGERROR, "%s: Unable to parse XML",__FUNCTION__);
 888       continue;
 889     }
 890
 891     TiXmlHandle xhDoc(&doc);
 892     for (TiXmlElement *pxeMovie = xhDoc.FirstChild("episodeguide").FirstChild("episode").
 893       Element(); pxeMovie; pxeMovie = pxeMovie->NextSiblingElement())
 894     {
 895       EPISODE ep;
 896       TiXmlElement *pxeLink = pxeMovie->FirstChildElement("url");
 897       CStdString strEpNum;
 898       if (pxeLink && XMLUtils::GetInt(pxeMovie, "season", ep.iSeason) &&
 899         XMLUtils::GetString(pxeMovie, "epnum", strEpNum) && !strEpNum.empty())
 900       {
 901         CScraperUrl &scurlEp(ep.cScraperUrl);
 902         size_t dot = strEpNum.find(".");
 903         ep.iEpisode = atoi(strEpNum.c_str());
 904         ep.iSubepisode = (dot != std::string::npos) ? atoi(strEpNum.substr(dot + 1).c_str()) : 0;
 905         if (!XMLUtils::GetString(pxeMovie, "title", scurlEp.strTitle) || scurlEp.strTitle.empty() )
 906             scurlEp.strTitle = g_localizeStrings.Get(416);
 907         XMLUtils::GetString(pxeMovie, "id", scurlEp.strId);
 908
 909         for ( ; pxeLink && pxeLink->FirstChild(); pxeLink = pxeLink->NextSiblingElement("url"))
 910           scurlEp.ParseElement(pxeLink);
 911
 912         // date must be the format of yyyy-mm-dd
 913         ep.cDate.SetValid(FALSE);
 914         CStdString sDate;
 915         if (XMLUtils::GetString(pxeMovie, "aired", sDate) && sDate.length() == 10)
 916         {
 917           tm tm;
 918           if (strptime(sDate, "%Y-%m-%d", &tm))
 919             ep.cDate.SetDate(1900+tm.tm_year, tm.tm_mon + 1, tm.tm_mday);
 920         }
 921         vcep.push_back(ep);
 922       }
 923     }
 924   }
 925
 926   return vcep;
 927 }
 928
 929 // takes URL; returns true and populates video details on success, false otherwise
 930 bool CScraper::GetVideoDetails(XFILE::CCurlFile &fcurl, const CScraperUrl &scurl,
 931   bool fMovie/*else episode*/, CVideoInfoTag &video)
 932 {
 933   CLog::Log(LOGDEBUG, "%s: Reading %s '%s' using %s scraper "
 934     "(file: '%s', content: '%s', version: '%s')", __FUNCTION__,
 935     fMovie ? "movie" : "episode", scurl.m_url[0].m_url.c_str(), Name().c_str(), Path().c_str(),
 936     ADDON::TranslateContent(Content()).c_str(), Version().c_str());
 937
 938   video.Reset();
 939   CStdString sFunc = fMovie ? "GetDetails" : "GetEpisodeDetails";
 940   vector<CStdString> vcsIn;
 941   vcsIn.push_back(scurl.strId);
 942   vcsIn.push_back(scurl.m_url[0].m_url);
 943   vector<CStdString> vcsOut = RunNoThrow(sFunc, scurl, fcurl, &vcsIn);
 944
 945   // parse XML output
 946   bool fRet(false);
 947   for (CStdStringArray::const_iterator i = vcsOut.begin(); i != vcsOut.end(); ++i)
 948   {
 949     CXBMCTinyXML doc;
 950     doc.Parse(*i, TIXML_ENCODING_UTF8);
 951     if (!doc.RootElement())
 952     {
 953       CLog::Log(LOGERROR, "%s: Unable to parse XML", __FUNCTION__);
 954       continue;
 955     }
 956
 957     TiXmlHandle xhDoc(&doc);
 958     TiXmlElement *pxeDetails = xhDoc.FirstChild("details").Element();
 959     if (!pxeDetails)
 960     {
 961       CLog::Log(LOGERROR, "%s: Invalid XML file (want <details>)", __FUNCTION__);
 962       continue;
 963     }
 964     video.Load(pxeDetails, true/*fChain*/);
 965     fRet = true;  // but don't exit in case of chaining
 966   }
 967   return fRet;
 968 }
 969
 970 // takes a URL; returns true and populates album on success, false otherwise
 971 bool CScraper::GetAlbumDetails(CCurlFile &fcurl, const CScraperUrl &scurl, CAlbum &album)
 972 {
 973   CLog::Log(LOGDEBUG, "%s: Reading '%s' using %s scraper "
 974     "(file: '%s', content: '%s', version: '%s')", __FUNCTION__,
 975     scurl.m_url[0].m_url.c_str(), Name().c_str(), Path().c_str(),
 976     ADDON::TranslateContent(Content()).c_str(), Version().c_str());
 977
 978   vector<CStdString> vcsOut = RunNoThrow("GetAlbumDetails", scurl, fcurl);
 979
 980   // parse the returned XML into an album object (see CAlbum::Load for details)
 981   bool fRet(false);
 982   for (CStdStringArray::const_iterator i = vcsOut.begin(); i != vcsOut.end(); ++i)
 983   {
 984     CXBMCTinyXML doc;
 985     doc.Parse(*i, TIXML_ENCODING_UTF8);
 986     if (!doc.RootElement())
 987     {
 988       CLog::Log(LOGERROR, "%s: Unable to parse XML", __FUNCTION__);
 989       return false;
 990     }
 991     fRet = album.Load(doc.RootElement(), i != vcsOut.begin());
 992   }
 993   return fRet;
 994 }
 995
 996 // takes a URL (one returned from FindArtist), the original search string, and
 997 // returns true and populates artist on success, false on failure
 998 bool CScraper::GetArtistDetails(CCurlFile &fcurl, const CScraperUrl &scurl,
 999   const CStdString &sSearch, CArtist &artist)
1000 {
1001   if (!scurl.m_url.size())
1002     return false;
1003
1004   CLog::Log(LOGDEBUG, "%s: Reading '%s' ('%s') using %s scraper "
1005     "(file: '%s', content: '%s', version: '%s')", __FUNCTION__,
1006     scurl.m_url[0].m_url.c_str(), sSearch.c_str(), Name().c_str(), Path().c_str(),
1007     ADDON::TranslateContent(Content()).c_str(), Version().c_str());
1008
1009   // pass in the original search string for chaining to search other sites
1010   vector<CStdString> vcIn;
1011   vcIn.push_back(sSearch);
1012   CURL::Encode(vcIn[0]);
1013
1014   vector<CStdString> vcsOut = RunNoThrow("GetArtistDetails", scurl, fcurl, &vcIn);
1015
1016   // ok, now parse the xml file
1017   bool fRet(false);
1018   for (vector<CStdString>::const_iterator i = vcsOut.begin(); i != vcsOut.end(); ++i)
1019   {
1020     CXBMCTinyXML doc;
1021     doc.Parse(*i, TIXML_ENCODING_UTF8);
1022     if (!doc.RootElement())
1023     {
1024       CLog::Log(LOGERROR, "%s: Unable to parse XML", __FUNCTION__);
1025       return false;
1026     }
1027
1028     fRet = artist.Load(doc.RootElement(), i != vcsOut.begin());
1029   }
1030   return fRet;
1031 }
1032
1033 }
1034