2 * Copyright (C) 2005-2013 Team XBMC
5 * This Program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2, or (at your option)
10 * This Program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with XBMC; see the file COPYING. If not, see
17 * <http://www.gnu.org/licenses/>.
22 #include "ScraperUrl.h"
23 #include "settings/AdvancedSettings.h"
25 #include "CharsetConverter.h"
26 #include "utils/CharsetDetection.h"
27 #include "utils/StringUtils.h"
29 #include "filesystem/CurlFile.h"
30 #include "filesystem/ZipFile.h"
32 #include "utils/XBMCTinyXML.h"
33 #include "utils/Mime.h"
40 CScraperUrl::CScraperUrl(const CStdString& strUrl)
// Constructs a CScraperUrl from an XML element by handing it to ParseElement().
46 CScraperUrl::CScraperUrl(const TiXmlElement* element)
// ParseElement() returns a bool; that result is not used on this line, so a
// null or childless element is not signalled to the caller from here.
49 ParseElement(element);
52 CScraperUrl::CScraperUrl()
57 CScraperUrl::~CScraperUrl()
61 void CScraperUrl::Clear()
// Parses the XML text currently stored in m_xml and returns what ParseString()
// returns for it.
69 bool CScraperUrl::Parse()
// Work on a copy of m_xml rather than on the member itself.
// NOTE(review): presumably required because ParseElement() appends to m_xml
// while parsing -- confirm.
71 CStdString strToParse = m_xml;
73 return ParseString(strToParse);
// Reads one URL entry from `element` and appends it to m_url.
// The URL text is the value of the element's first child. The attributes
// "spoof", "post", "gzip", "cache", "type", "season" and "aspect" are read
// from the element as well.
// Returns false when `element` is null, has no first child, or that child's
// Value() is null.
76 bool CScraperUrl::ParseElement(const TiXmlElement* element)
78 if (!element || !element->FirstChild() ||
79 !element->FirstChild()->Value()) return false;
// Append the text accumulated in `stream` to m_xml.
// NOTE(review): presumably `stream` holds the serialized element -- confirm.
83 m_xml += stream.str();
86 url.m_url = element->FirstChild()->Value();
// Attribute values are held as raw C-string pointers; the checks below show
// that a pointer may be null.
87 const char* pSpoof = element->Attribute("spoof");
90 const char* szPost=element->Attribute("post");
// "post" is matched case-insensitively against "yes".
91 if (szPost && stricmp(szPost,"yes") == 0)
95 const char* szIsGz=element->Attribute("gzip");
// "gzip" is matched case-insensitively against "yes".
96 if (szIsGz && stricmp(szIsGz,"yes") == 0)
100 const char* pCache = element->Attribute("cache");
// NOTE(review): verify pCache is tested for null before this assignment.
102 url.m_cache = pCache;
104 const char* szType = element->Attribute("type");
// Entries default to URL_TYPE_GENERAL; type="season" (case-insensitive)
// switches the entry to URL_TYPE_SEASON.
105 url.m_type = URL_TYPE_GENERAL;
107 if (szType && stricmp(szType,"season") == 0)
109 url.m_type = URL_TYPE_SEASON;
110 const char* szSeason = element->Attribute("season");
// NOTE(review): atoi() must not receive a null pointer; verify szSeason is
// tested for null before this call. atoi() also reports no parse errors, so a
// non-numeric "season" attribute yields 0.
112 url.m_season = atoi(szSeason);
114 const char *aspect = element->Attribute("aspect");
// NOTE(review): verify `aspect` is tested for null before this assignment.
116 url.m_aspect = aspect;
// Store the completed entry.
118 m_url.push_back(url);
// Parses `strUrl` as XML and passes elements of the document to
// ParseElement(), which appends the resulting entries to m_url.
123 bool CScraperUrl::ParseString(CStdString strUrl)
// The document is parsed with TIXML_ENCODING_UNKNOWN as the encoding argument.
129 doc.Parse(strUrl, TIXML_ENCODING_UNKNOWN);
131 TiXmlElement* pElement = doc.RootElement();
// An entry of type URL_TYPE_GENERAL is pushed onto m_url directly here,
// without going through ParseElement().
// NOTE(review): presumably the fallback for input that is a plain URL rather
// than XML (no root element) -- confirm.
136 url.m_type = URL_TYPE_GENERAL;
140 m_url.push_back(url);
// Parse the current element, then advance to the next sibling element that
// has the same tag name as the current one.
147 ParseElement(pElement);
148 pElement = pElement->NextSiblingElement(pElement->Value());
// Searches m_url, in order, for an entry of type URL_TYPE_GENERAL that matches
// `type`. An entry matches when `type` is empty, `type` is "thumb", or the
// entry's m_aspect equals `type`.
155 const CScraperUrl::SUrlEntry CScraperUrl::GetFirstThumb(const std::string &type) const
157 for (vector<SUrlEntry>::const_iterator iter=m_url.begin();iter != m_url.end();++iter)
159 if (iter->m_type == URL_TYPE_GENERAL && (type.empty() || type == "thumb" || iter->m_aspect == type))
// Field values given to `result`: general type, no POST, no gzip, season -1.
// NOTE(review): presumably this is the entry returned when no match is
// found -- confirm.
164 result.m_type = URL_TYPE_GENERAL;
165 result.m_post = false;
166 result.m_isgz = false;
167 result.m_season = -1;
// Searches m_url, in order, for an entry of type URL_TYPE_SEASON whose
// m_season equals `season` and that matches `type`. An entry matches `type`
// when `type` is empty, `type` is "thumb", or the entry's m_aspect equals
// `type`.
171 const CScraperUrl::SUrlEntry CScraperUrl::GetSeasonThumb(int season, const std::string &type) const
173 for (vector<SUrlEntry>::const_iterator iter=m_url.begin();iter != m_url.end();++iter)
175 if (iter->m_type == URL_TYPE_SEASON && iter->m_season == season &&
176 (type.empty() || type == "thumb" || iter->m_aspect == type))
// Field values given to `result`: general type, no POST, no gzip, season -1.
// NOTE(review): presumably this is the entry returned when no match is
// found -- confirm.
181 result.m_type = URL_TYPE_GENERAL;
182 result.m_post = false;
183 result.m_isgz = false;
184 result.m_season = -1;
// Scans m_url and tracks, in maxSeason, the largest positive m_season found
// among entries of type URL_TYPE_SEASON. maxSeason starts at 0, so it stays 0
// when there is no such entry.
// NOTE(review): presumably maxSeason is the return value -- confirm.
188 unsigned int CScraperUrl::GetMaxSeasonThumb() const
190 unsigned int maxSeason = 0;
191 for (vector<SUrlEntry>::const_iterator iter=m_url.begin();iter != m_url.end();++iter)
// The `m_season > 0` test is evaluated before the cast, so only positive
// values are converted to unsigned int for the comparison.
193 if (iter->m_type == URL_TYPE_SEASON && iter->m_season > 0 && (unsigned int)iter->m_season > maxSeason)
194 maxSeason = iter->m_season;
// Retrieves the content for `scrURL` into `strHTML` using `http`.
// Steps visible below: optional lookup in an on-disk cache, an HTTP POST or
// GET, file type detection, in-memory unpacking of zip/gzip content, charset
// conversion to UTF-8 for text content, and an optional write to the cache.
//   scrURL       - entry describing the URL and its options (spoof, cache...).
//   strHTML      - receives the content.
//   http         - curl file object used for the request.
//   cacheContext - sub-folder name used inside the scraper cache folder.
199 bool CScraperUrl::Get(const SUrlEntry& scrURL, std::string& strHTML, XFILE::CCurlFile& http, const CStdString& cacheContext)
201 CURL url(scrURL.m_url);
// The entry's spoof value is used as the HTTP referer.
202 http.SetReferer(scrURL.m_spoof);
203 CStdString strCachePath;
// NOTE(review): presumably only requested when scrURL.m_isgz is set -- confirm.
206 http.SetContentEncoding("gzip");
// Cache lookup, only for entries that name a cache file. The path is built
// from g_advancedSettings.m_cachePath and "scrapers/<cacheContext>/<m_cache>".
// NOTE(review): m_cache is filled from the "cache" XML attribute in
// ParseElement() and is concatenated into the path as-is; verify that values
// containing ".." or path separators cannot escape the cache folder.
208 if (!scrURL.m_cache.empty())
210 strCachePath = URIUtils::AddFileToFolder(g_advancedSettings.m_cachePath,
211 "scrapers/" + cacheContext + "/" + scrURL.m_cache);
212 if (XFILE::CFile::Exists(strCachePath))
215 XFILE::auto_buffer buffer;
216 if (file.LoadFile(strCachePath, buffer))
// Copy the loaded bytes into strHTML by pointer and length.
218 strHTML.assign(buffer.get(), buffer.length());
// Separate string object that receives the HTTP response below; it starts as
// a copy of strHTML.
224 CStdString strHTML1(strHTML);
// POST request: the URL's options string, minus its first character, is
// passed to http.Post().
// NOTE(review): presumably the dropped character is the leading '?'. If
// CStdString behaves like std::string, substr(1) throws std::out_of_range on
// an empty string -- verify GetOptions() cannot be empty here.
228 CStdString strOptions = url.GetOptions();
229 strOptions = strOptions.substr(1);
232 if (!http.Post(url.Get(), strOptions, strHTML1))
// GET request.
236 if (!http.Get(url.Get(), strHTML1))
// Determine the file type from the MIME type reported by `http`; when that is
// FileTypeUnknown, fall back to inspecting the content.
// NOTE(review): the response is received in strHTML1 but the content check
// reads strHTML -- confirm strHTML has been updated from strHTML1 before this
// point.
241 std::string mimeType(http.GetMimeType());
242 CMime::EFileType ftype = CMime::GetFileTypeFromMime(mimeType);
243 if (ftype == CMime::FileTypeUnknown)
244 ftype = CMime::GetFileTypeFromContent(strHTML);
// Zip and gzip content is unpacked in memory; the gzip flag passed to the
// unpacker comes from the entry (scrURL.m_isgz), not from the detected type.
246 if (ftype == CMime::FileTypeZip || ftype == CMime::FileTypeGZip)
248 XFILE::CZipFile file;
249 std::string strBuffer;
250 int iSize = file.UnpackFromMemory(strBuffer,strHTML,scrURL.m_isgz); // FIXME: use FileTypeGZip instead of scrURL.m_isgz?
254 CLog::Log(LOGDEBUG, "%s: Archive \"%s\" was unpacked in memory", __FUNCTION__, scrURL.m_url.c_str());
257 CLog::Log(LOGWARNING, "%s: \"%s\" looks like archive, but cannot be unpacked", __FUNCTION__, scrURL.m_url.c_str());
// Charset handling, chosen by file type. The charset reported by the server
// is the starting hint in every branch.
260 std::string reportedCharset(http.GetServerReportedCharset());
// HTML: a false return from ConvertHtmlToUtf8() is logged as "can't find
// precise charset", with realHtmlCharset named as the fallback in use.
261 if (ftype == CMime::FileTypeHtml)
263 std::string realHtmlCharset, converted;
264 if (!CCharsetDetection::ConvertHtmlToUtf8(strHTML, converted, reportedCharset, realHtmlCharset))
265 CLog::Log(LOGWARNING, "%s: Can't find precise charset for \"%s\", using \"%s\" as fallback", __FUNCTION__, scrURL.m_url.c_str(), realHtmlCharset.c_str());
267 CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for \"%s\"", __FUNCTION__, realHtmlCharset.c_str(), scrURL.m_url.c_str());
// XML: parse the content with the reported charset, ask the document which
// charset it used, and convert from that charset when it is non-empty.
271 else if (ftype == CMime::FileTypeXml)
274 xmlDoc.Parse(strHTML, reportedCharset);
276 std::string realXmlCharset(xmlDoc.GetUsedCharset());
277 if (!realXmlCharset.empty())
279 CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for \"%s\"", __FUNCTION__, realXmlCharset.c_str(), scrURL.m_url.c_str());
280 std::string converted;
281 g_charsetConverter.ToUtf8(realXmlCharset, strHTML, converted);
// Plain text, or any MIME type whose first five characters are "text/"
// (compared case-insensitively).
285 else if (ftype == CMime::FileTypePlainText || StringUtils::CompareNoCase(mimeType.substr(0, 5), "text/") == 0)
287 std::string realTextCharset, converted;
288 CCharsetDetection::ConvertPlainTextToUtf8(strHTML, converted, reportedCharset, realTextCharset);
// Warn when the charset actually used differs from the server-reported one.
290 if (reportedCharset != realTextCharset)
291 CLog::Log(LOGWARNING, "%s: Using \"%s\" charset for \"%s\" instead of server reported \"%s\" charset", __FUNCTION__, realTextCharset.c_str(), scrURL.m_url.c_str(), reportedCharset.c_str());
293 CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for \"%s\"", __FUNCTION__, realTextCharset.c_str(), scrURL.m_url.c_str());
// Any other type with a reported charset other than "UTF-8": convert from the
// reported charset.
// NOTE(review): this comparison is case-sensitive; confirm that
// GetServerReportedCharset() normalizes case, otherwise "utf-8" is converted
// needlessly.
295 else if (!reportedCharset.empty() && reportedCharset != "UTF-8")
297 CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for \"%s\"", __FUNCTION__, reportedCharset.c_str(), scrURL.m_url.c_str());
298 std::string converted;
299 g_charsetConverter.ToUtf8(reportedCharset, strHTML, converted);
// Remaining case: the content is used as-is (binary, or text taken as UTF-8).
303 CLog::Log(LOGDEBUG, "%s: Using content of \"%s\" as binary or text with \"UTF-8\" charset", __FUNCTION__, scrURL.m_url.c_str());
// Cache store: write strHTML to the same path the lookup above builds.
// The result of Write() is not checked, so a failed or partial write is not
// reported from here.
305 if (!scrURL.m_cache.empty())
307 CStdString strCachePath = URIUtils::AddFileToFolder(g_advancedSettings.m_cachePath,
308 "scrapers/" + cacheContext + "/" + scrURL.m_cache);
310 if (file.OpenForWrite(strCachePath,true))
311 file.Write(strHTML.data(),strHTML.size());
317 // XML format of strUrls is:
318 // <TAG><url>...</url>...</TAG> (parsed by ParseElement) or <url>...</url> (ditto)
// Parses `strUrls` as XML and passes the matching elements to the loop and
// branch below. The code looks for an <episodeguide> child of the document:
// when it contains <url> children each of them is visited, otherwise the
// <episodeguide> element itself is considered if it has a first child with a
// non-null Value().
319 bool CScraperUrl::ParseEpisodeGuide(CStdString strUrls)
324 // ok, now parse the xml file
326 doc.Parse(strUrls, TIXML_ENCODING_UNKNOWN);
327 if (doc.RootElement())
329 TiXmlHandle docHandle( &doc );
// NOTE(review): Element() yields a null pointer when the document has no
// top-level <episodeguide> element (for example the bare <url>...</url> form
// mentioned above). No null check on `link` is visible before the
// dereference on the next line -- verify, as this would be a null-pointer
// dereference.
330 TiXmlElement *link = docHandle.FirstChild("episodeguide").Element();
331 if (link->FirstChildElement("url"))
// Visit every <url> child in document order.
333 for (link = link->FirstChildElement("url"); link; link = link->NextSiblingElement("url"))
// No <url> children: fall back to the <episodeguide> element's own content.
336 else if (link->FirstChild() && link->FirstChild()->Value())
// Builds the thumb URL string for `entry`.
// With a spoof value the result is "<m_url>|Referer=<encoded spoof>", where
// the spoof value is passed through CURL::Encode() first.
345 CStdString CScraperUrl::GetThumbURL(const CScraperUrl::SUrlEntry &entry)
// Entries without a spoof value take a separate path.
// NOTE(review): presumably that path returns entry.m_url unchanged -- confirm.
347 if (entry.m_spoof.empty())
// Encode a copy so that `entry` itself is left unmodified.
349 CStdString spoof = entry.m_spoof;
350 spoof = CURL::Encode(spoof);
351 return entry.m_url + "|Referer=" + spoof;
// Appends to `thumbs` the GetThumbURL() string of entries in m_url, in order,
// subject to the two tests below.
//   Aspect test: `type` is empty, `type` is "thumb", the entry's m_aspect
//   equals `type`, or the entry's m_aspect is empty.
//   Season test: a URL_TYPE_GENERAL entry qualifies only when `season` is -1;
//   a URL_TYPE_SEASON entry qualifies only when its m_season equals `season`.
// Entries with an empty m_aspect pass the aspect test here; GetFirstThumb()
// and GetSeasonThumb() have no such clause.
// `thumbs` is only appended to in the visible code.
354 void CScraperUrl::GetThumbURLs(std::vector<CStdString> &thumbs, const std::string &type, int season) const
356 for (vector<SUrlEntry>::const_iterator iter = m_url.begin(); iter != m_url.end(); ++iter)
358 if (iter->m_aspect == type || type.empty() || type == "thumb" || iter->m_aspect.empty())
360 if ((iter->m_type == CScraperUrl::URL_TYPE_GENERAL && season == -1)
361 || (iter->m_type == CScraperUrl::URL_TYPE_SEASON && iter->m_season == season))
363 thumbs.push_back(GetThumbURL(*iter));