code.vuplus.com Git - vuplus_xbmc/blob - xbmc/utils/ScraperUrl.cpp

   1 /*
   2  *      Copyright (C) 2005-2013 Team XBMC
   3  *      http://xbmc.org
   4  *
   5  *  This Program is free software; you can redistribute it and/or modify
   6  *  it under the terms of the GNU General Public License as published by
   7  *  the Free Software Foundation; either version 2, or (at your option)
   8  *  any later version.
   9  *
  10  *  This Program is distributed in the hope that it will be useful,
  11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13  *  GNU General Public License for more details.
  14  *
  15  *  You should have received a copy of the GNU General Public License
  16  *  along with XBMC; see the file COPYING.  If not, see
  17  *  <http://www.gnu.org/licenses/>.
  18  *
  19  */
  20
  21 #include "XMLUtils.h"
  22 #include "ScraperUrl.h"
  23 #include "settings/AdvancedSettings.h"
  24 #include "HTMLUtil.h"
  25 #include "CharsetConverter.h"
  26 #include "utils/CharsetDetection.h"
  27 #include "utils/StringUtils.h"
  28 #include "URL.h"
  29 #include "filesystem/CurlFile.h"
  30 #include "filesystem/ZipFile.h"
  31 #include "URIUtils.h"
  32 #include "utils/XBMCTinyXML.h"
  33 #include "utils/Mime.h"
  34
  35 #include <cstring>
  36 #include <sstream>
  37
  38 using namespace std;
  39
  40 CScraperUrl::CScraperUrl(const CStdString& strUrl)
  41 {
  42   relevance = 0;
  43   ParseString(strUrl);
  44 }
  45
  46 CScraperUrl::CScraperUrl(const TiXmlElement* element)
  47 {
  48   relevance = 0;
  49   ParseElement(element);
  50 }
  51
  52 CScraperUrl::CScraperUrl()
  53 {
  54   relevance = 0;
  55 }
  56
  57 CScraperUrl::~CScraperUrl()
  58 {
  59 }
  60
  61 void CScraperUrl::Clear()
  62 {
  63   m_url.clear();
  64   m_spoof.clear();
  65   m_xml.clear();
  66   relevance = 0;
  67 }
  68
  69 bool CScraperUrl::Parse()
  70 {
  71   CStdString strToParse = m_xml;
  72   m_xml.clear();
  73   return ParseString(strToParse);
  74 }
  75
  76 bool CScraperUrl::ParseElement(const TiXmlElement* element)
  77 {
  78   if (!element || !element->FirstChild() ||
  79       !element->FirstChild()->Value()) return false;
  80
  81   stringstream stream;
  82   stream << *element;
  83   m_xml += stream.str();
  84
  85   SUrlEntry url;
  86   url.m_url = element->FirstChild()->Value();
  87   const char* pSpoof = element->Attribute("spoof");
  88   if (pSpoof)
  89     url.m_spoof = pSpoof;
  90   const char* szPost=element->Attribute("post");
  91   if (szPost && stricmp(szPost,"yes") == 0)
  92     url.m_post = true;
  93   else
  94     url.m_post = false;
  95   const char* szIsGz=element->Attribute("gzip");
  96   if (szIsGz && stricmp(szIsGz,"yes") == 0)
  97     url.m_isgz = true;
  98   else
  99     url.m_isgz = false;
 100   const char* pCache = element->Attribute("cache");
 101   if (pCache)
 102     url.m_cache = pCache;
 103
 104   const char* szType = element->Attribute("type");
 105   url.m_type = URL_TYPE_GENERAL;
 106   url.m_season = -1;
 107   if (szType && stricmp(szType,"season") == 0)
 108   {
 109     url.m_type = URL_TYPE_SEASON;
 110     const char* szSeason = element->Attribute("season");
 111     if (szSeason)
 112       url.m_season = atoi(szSeason);
 113   }
 114   const char *aspect = element->Attribute("aspect");
 115   if (aspect)
 116     url.m_aspect = aspect;
 117
 118   m_url.push_back(url);
 119
 120   return true;
 121 }
 122
 123 bool CScraperUrl::ParseString(CStdString strUrl)
 124 {
 125   if (strUrl.empty())
 126     return false;
 127
 128   CXBMCTinyXML doc;
 129   doc.Parse(strUrl, TIXML_ENCODING_UNKNOWN);
 130
 131   TiXmlElement* pElement = doc.RootElement();
 132   if (!pElement)
 133   {
 134     SUrlEntry url;
 135     url.m_url = strUrl;
 136     url.m_type = URL_TYPE_GENERAL;
 137     url.m_season = -1;
 138     url.m_post = false;
 139     url.m_isgz = false;
 140     m_url.push_back(url);
 141     m_xml = strUrl;
 142   }
 143   else
 144   {
 145     while (pElement)
 146     {
 147       ParseElement(pElement);
 148       pElement = pElement->NextSiblingElement(pElement->Value());
 149     }
 150   }
 151
 152   return true;
 153 }
 154
 155 const CScraperUrl::SUrlEntry CScraperUrl::GetFirstThumb(const std::string &type) const
 156 {
 157   for (vector<SUrlEntry>::const_iterator iter=m_url.begin();iter != m_url.end();++iter)
 158   {
 159     if (iter->m_type == URL_TYPE_GENERAL && (type.empty() || type == "thumb" || iter->m_aspect == type))
 160       return *iter;
 161   }
 162
 163   SUrlEntry result;
 164   result.m_type = URL_TYPE_GENERAL;
 165   result.m_post = false;
 166   result.m_isgz = false;
 167   result.m_season = -1;
 168   return result;
 169 }
 170
 171 const CScraperUrl::SUrlEntry CScraperUrl::GetSeasonThumb(int season, const std::string &type) const
 172 {
 173   for (vector<SUrlEntry>::const_iterator iter=m_url.begin();iter != m_url.end();++iter)
 174   {
 175     if (iter->m_type == URL_TYPE_SEASON && iter->m_season == season &&
 176        (type.empty() || type == "thumb" || iter->m_aspect == type))
 177       return *iter;
 178   }
 179
 180   SUrlEntry result;
 181   result.m_type = URL_TYPE_GENERAL;
 182   result.m_post = false;
 183   result.m_isgz = false;
 184   result.m_season = -1;
 185   return result;
 186 }
 187
 188 unsigned int CScraperUrl::GetMaxSeasonThumb() const
 189 {
 190   unsigned int maxSeason = 0;
 191   for (vector<SUrlEntry>::const_iterator iter=m_url.begin();iter != m_url.end();++iter)
 192   {
 193     if (iter->m_type == URL_TYPE_SEASON && iter->m_season > 0 && (unsigned int)iter->m_season > maxSeason)
 194       maxSeason = iter->m_season;
 195   }
 196   return maxSeason;
 197 }
 198
 199 bool CScraperUrl::Get(const SUrlEntry& scrURL, std::string& strHTML, XFILE::CCurlFile& http, const CStdString& cacheContext)
 200 {
 201   CURL url(scrURL.m_url);
 202   http.SetReferer(scrURL.m_spoof);
 203   CStdString strCachePath;
 204
 205   if (scrURL.m_isgz)
 206     http.SetContentEncoding("gzip");
 207
 208   if (!scrURL.m_cache.empty())
 209   {
 210     strCachePath = URIUtils::AddFileToFolder(g_advancedSettings.m_cachePath,
 211                               "scrapers/" + cacheContext + "/" + scrURL.m_cache);
 212     if (XFILE::CFile::Exists(strCachePath))
 213     {
 214       XFILE::CFile file;
 215       XFILE::auto_buffer buffer;
 216       if (file.LoadFile(strCachePath, buffer))
 217       {
 218         strHTML.assign(buffer.get(), buffer.length());
 219         return true;
 220       }
 221     }
 222   }
 223
 224   CStdString strHTML1(strHTML);
 225
 226   if (scrURL.m_post)
 227   {
 228     CStdString strOptions = url.GetOptions();
 229     strOptions = strOptions.substr(1);
 230     url.SetOptions("");
 231
 232     if (!http.Post(url.Get(), strOptions, strHTML1))
 233       return false;
 234   }
 235   else
 236     if (!http.Get(url.Get(), strHTML1))
 237       return false;
 238
 239   strHTML = strHTML1;
 240
 241   std::string mimeType(http.GetMimeType());
 242   CMime::EFileType ftype = CMime::GetFileTypeFromMime(mimeType);
 243   if (ftype == CMime::FileTypeUnknown)
 244     ftype = CMime::GetFileTypeFromContent(strHTML);
 245
 246   if (ftype == CMime::FileTypeZip || ftype == CMime::FileTypeGZip)
 247   {
 248     XFILE::CZipFile file;
 249     std::string strBuffer;
 250     int iSize = file.UnpackFromMemory(strBuffer,strHTML,scrURL.m_isgz); // FIXME: use FileTypeGZip instead of scrURL.m_isgz?
 251     if (iSize > 0)
 252     {
 253       strHTML = strBuffer;
 254       CLog::Log(LOGDEBUG, "%s: Archive \"%s\" was unpacked in memory", __FUNCTION__, scrURL.m_url.c_str());
 255     }
 256     else
 257       CLog::Log(LOGWARNING, "%s: \"%s\" looks like archive, but cannot be unpacked", __FUNCTION__, scrURL.m_url.c_str());
 258   }
 259
 260   std::string reportedCharset(http.GetServerReportedCharset());
 261   if (ftype == CMime::FileTypeHtml)
 262   {
 263     std::string realHtmlCharset, converted;
 264     if (!CCharsetDetection::ConvertHtmlToUtf8(strHTML, converted, reportedCharset, realHtmlCharset))
 265       CLog::Log(LOGWARNING, "%s: Can't find precise charset for \"%s\", using \"%s\" as fallback", __FUNCTION__, scrURL.m_url.c_str(), realHtmlCharset.c_str());
 266     else
 267       CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for \"%s\"", __FUNCTION__, realHtmlCharset.c_str(), scrURL.m_url.c_str());
 268
 269     strHTML = converted;
 270   }
 271   else if (ftype == CMime::FileTypeXml)
 272   {
 273     CXBMCTinyXML xmlDoc;
 274     xmlDoc.Parse(strHTML, reportedCharset);
 275
 276     std::string realXmlCharset(xmlDoc.GetUsedCharset());
 277     if (!realXmlCharset.empty())
 278     {
 279       CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for \"%s\"", __FUNCTION__, realXmlCharset.c_str(), scrURL.m_url.c_str());
 280       std::string converted;
 281       g_charsetConverter.ToUtf8(realXmlCharset, strHTML, converted);
 282       strHTML = converted;
 283     }
 284   }
 285   else if (ftype == CMime::FileTypePlainText || StringUtils::CompareNoCase(mimeType.substr(0, 5), "text/") == 0)
 286   {
 287     std::string realTextCharset, converted;
 288     CCharsetDetection::ConvertPlainTextToUtf8(strHTML, converted, reportedCharset, realTextCharset);
 289     strHTML = converted;
 290     if (reportedCharset != realTextCharset)
 291       CLog::Log(LOGWARNING, "%s: Using \"%s\" charset for \"%s\" instead of server reported \"%s\" charset", __FUNCTION__, realTextCharset.c_str(), scrURL.m_url.c_str(), reportedCharset.c_str());
 292     else
 293       CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for \"%s\"", __FUNCTION__, realTextCharset.c_str(), scrURL.m_url.c_str());
 294   }
 295   else if (!reportedCharset.empty() && reportedCharset != "UTF-8")
 296   {
 297     CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for \"%s\"", __FUNCTION__, reportedCharset.c_str(), scrURL.m_url.c_str());
 298     std::string converted;
 299     g_charsetConverter.ToUtf8(reportedCharset, strHTML, converted);
 300     strHTML = converted;
 301   }
 302   else
 303     CLog::Log(LOGDEBUG, "%s: Using content of \"%s\" as binary or text with \"UTF-8\" charset", __FUNCTION__, scrURL.m_url.c_str());
 304
 305   if (!scrURL.m_cache.empty())
 306   {
 307     CStdString strCachePath = URIUtils::AddFileToFolder(g_advancedSettings.m_cachePath,
 308                               "scrapers/" + cacheContext + "/" + scrURL.m_cache);
 309     XFILE::CFile file;
 310     if (file.OpenForWrite(strCachePath,true))
 311       file.Write(strHTML.data(),strHTML.size());
 312     file.Close();
 313   }
 314   return true;
 315 }
 316
 317 // XML format is of strUrls is:
 318 // <TAG><url>...</url>...</TAG> (parsed by ParseElement) or <url>...</url> (ditto)
 319 bool CScraperUrl::ParseEpisodeGuide(CStdString strUrls)
 320 {
 321   if (strUrls.empty())
 322     return false;
 323
 324   // ok, now parse the xml file
 325   CXBMCTinyXML doc;
 326   doc.Parse(strUrls, TIXML_ENCODING_UNKNOWN);
 327   if (doc.RootElement())
 328   {
 329     TiXmlHandle docHandle( &doc );
 330     TiXmlElement *link = docHandle.FirstChild("episodeguide").Element();
 331     if (link->FirstChildElement("url"))
 332     {
 333       for (link = link->FirstChildElement("url"); link; link = link->NextSiblingElement("url"))
 334         ParseElement(link);
 335     }
 336     else if (link->FirstChild() && link->FirstChild()->Value())
 337       ParseElement(link);
 338   }
 339   else
 340     return false;
 341
 342   return true;
 343 }
 344
 345 CStdString CScraperUrl::GetThumbURL(const CScraperUrl::SUrlEntry &entry)
 346 {
 347   if (entry.m_spoof.empty())
 348     return entry.m_url;
 349   CStdString spoof = entry.m_spoof;
 350   spoof = CURL::Encode(spoof);
 351   return entry.m_url + "|Referer=" + spoof;
 352 }
 353
 354 void CScraperUrl::GetThumbURLs(std::vector<CStdString> &thumbs, const std::string &type, int season) const
 355 {
 356   for (vector<SUrlEntry>::const_iterator iter = m_url.begin(); iter != m_url.end(); ++iter)
 357   {
 358     if (iter->m_aspect == type || type.empty() || type == "thumb" || iter->m_aspect.empty())
 359     {
 360       if ((iter->m_type == CScraperUrl::URL_TYPE_GENERAL && season == -1)
 361        || (iter->m_type == CScraperUrl::URL_TYPE_SEASON && iter->m_season == season))
 362       {
 363         thumbs.push_back(GetThumbURL(*iter));
 364       }
 365     }
 366   }
 367 }