--- /dev/null
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+"""Planet aggregator library.
+
+This package is a library for developing web sites or software that
+aggregate RSS, CDF and Atom feeds taken from elsewhere into a single,
+combined feed.
+"""
+
+__version__ = "2.0"
+__authors__ = [ "Scott James Remnant <scott@netsplit.com>",
+ "Jeff Waugh <jdub@perkypants.org>" ]
+__license__ = "Python"
+
+
+# Modules available without separate import
+import cache
+import feedparser
+import sanitize
+import htmltmpl
+import sgmllib
+try:
+ import logging
+except:
+ import compat_logging as logging
+
+# Limit the effect of "from planet import *"
+__all__ = ("cache", "feedparser", "htmltmpl", "logging",
+ "Planet", "Channel", "NewsItem")
+
+
+import os
+import md5
+import time
+import dbhash
+import re
+
try:
    from xml.sax.saxutils import escape
except ImportError:
    def escape(data):
        """Escape the XML special characters ``&``, ``>`` and ``<`` in data.

        Fallback used only when xml.sax.saxutils cannot be imported.
        """
        # The ampersand must be replaced first, otherwise the ampersands
        # introduced by the other two replacements would be escaped again.
        return (data.replace("&", "&amp;")
                    .replace(">", "&gt;")
                    .replace("<", "&lt;"))
+
# Version string advertised in generator headers
VERSION = "Planet/" + __version__ + " +http://www.planetplanet.org"

# Default User-Agent header to send when retrieving feeds
USER_AGENT = "%s %s" % (VERSION, feedparser.USER_AGENT)

# Where cached channels are kept unless configured otherwise
CACHE_DIRECTORY = "cache"

# How many items of a newly subscribed feed are shown by default
NEW_FEED_ITEMS = 10

# Common date/time formats (ISO 8601 style and RFC 822 style, both UTC)
TIMEFMT_ISO = "%Y-%m-%dT%H:%M:%S+00:00"
TIMEFMT_822 = "%a, %d %b %Y %H:%M:%S +0000"


# Module-wide logger; logging implementations that only offer warn()
# get a warning() alias so the rest of the module can rely on it.
log = logging.getLogger("planet")
if not hasattr(log, "warning"):
    log.warning = log.warn

# Fallback values for options in the template file config sections
ENCODING = "utf-8"
ITEMS_PER_PAGE = 60
DAYS_PER_PAGE = 0
OUTPUT_DIR = "output"
DATE_FORMAT = "%B %d, %Y %I:%M %p"
NEW_DATE_FORMAT = "%B %d, %Y"
ACTIVITY_THRESHOLD = 0
+
+class stripHtml(sgmllib.SGMLParser):
+ "remove all tags from the data"
+ def __init__(self, data):
+ sgmllib.SGMLParser.__init__(self)
+ self.result=''
+ self.feed(data)
+ self.close()
+ def handle_data(self, data):
+ if data: self.result+=data
+
def template_info(item, date_format):
    """Build the dictionary of values a template sees for item.

    Every key of item is copied across.  Keys whose type is item.DATE
    are rendered with date_format and additionally exposed with "_iso"
    and "_822" suffixes in the two fixed formats.  When the item has a
    title, "title_plain" holds that title with all tags removed.
    """
    info = {}
    for name in item.keys():
        is_date = item.key_type(name) == item.DATE
        if not is_date:
            info[name] = item[name]
            continue

        stamp = item.get_as_date(name)
        renderings = (("", date_format),
                      ("_iso", TIMEFMT_ISO),
                      ("_822", TIMEFMT_822))
        for suffix, fmt in renderings:
            info[name + suffix] = time.strftime(fmt, stamp)

    if 'title' in item.keys():
        plain = stripHtml(info['title'])
        info['title_plain'] = plain.result

    return info
+
+
class Planet:
    """A set of channels.

    This class represents a set of channels for which the items will
    be aggregated together into one combined feed.

    Properties:
        user_agent      User-Agent header to fetch feeds with.
        cache_directory Directory to store cached channels in.
        new_feed_items  Number of items to display from a new feed.
        filter          A regular expression that articles must match.
        exclude         A regular expression that articles must not match.
    """

    # Number of seconds in a day; used for the activity and max_days windows.
    _SECONDS_PER_DAY = 86400

    # Fixed messages for the HTTP statuses that are reported specially.
    _STATUS_MESSAGES = {
        403: "403: forbidden",
        404: "404: not found",
        408: "408: request timeout",
        410: "410: gone",
        500: "internal server error",
    }

    def __init__(self, config):
        """Create an empty planet driven by config.

        config is a ConfigParser-style object; this class calls its
        has_option(), get() and sections() methods.
        """
        self.config = config

        self._channels = []

        self.user_agent = USER_AGENT
        self.cache_directory = CACHE_DIRECTORY
        self.new_feed_items = NEW_FEED_ITEMS
        self.filter = None
        self.exclude = None

    def tmpl_config_get(self, template, option, default=None, raw=0, vars=None):
        """Get a template value from the configuration, with a default.

        The template's own section is consulted first, then the "Planet"
        section; default is returned when neither defines the option.
        raw and vars are handed through to the configuration's get().
        """
        for section in (template, "Planet"):
            if self.config.has_option(section, option):
                return self.config.get(section, option, raw=raw, vars=vars)
        return default

    def gather_channel_info(self, template_file="Planet"):
        """Collect the template information of every channel.

        Hidden channels are included.  Returns a (channels, channels_list)
        pair: a dictionary mapping each channel object to its template
        dictionary, and a list of those same dictionaries in the order
        returned by channels().  A "message" entry is added for feeds
        without recent activity (when an activity threshold is configured)
        and for feeds whose last HTTP status was 400 or above.
        """
        date_format = self.tmpl_config_get(template_file,
                                           "date_format", DATE_FORMAT, raw=1)

        activity_threshold = int(self.tmpl_config_get(template_file,
                                                      "activity_threshold",
                                                      ACTIVITY_THRESHOLD))

        if activity_threshold:
            activity_horizon = time.gmtime(
                time.time() - self._SECONDS_PER_DAY * activity_threshold)
        else:
            activity_horizon = 0

        channels = {}
        channels_list = []
        for channel in self.channels(hidden=1):
            info = template_info(channel, date_format)
            channels[channel] = info
            channels_list.append(info)

            # identify inactive feeds
            if activity_horizon:
                latest = channel.items(sorted=1)
                if len(latest) == 0 or latest[0].date < activity_horizon:
                    info["message"] = \
                        "no activity in %d days" % activity_threshold

            # report channel level errors (these override the message above)
            if not channel.url_status:
                continue
            status = int(channel.url_status)
            if status in self._STATUS_MESSAGES:
                info["message"] = self._STATUS_MESSAGES[status]
            elif status >= 400:
                info["message"] = "http status %s" % status

        return channels, channels_list

    def gather_items_info(self, channels, template_file="Planet", channel_list=None):
        """Collect the template information of the items to display.

        channels is the dictionary returned by gather_channel_info(); the
        values of an item's channel are copied into the item's dictionary
        under "channel_"-prefixed keys.  channel_list optionally restricts
        the channels the items are taken from.

        An item that starts a new day gets a "new_date" entry, and an item
        that starts a new day or a new channel gets a "new_channel" entry.
        Returns the list of item dictionaries.
        """
        items_list = []
        prev_date = []
        prev_channel = None

        date_format = self.tmpl_config_get(template_file,
                                           "date_format", DATE_FORMAT, raw=1)
        items_per_page = int(self.tmpl_config_get(template_file,
                                                  "items_per_page",
                                                  ITEMS_PER_PAGE))
        days_per_page = int(self.tmpl_config_get(template_file,
                                                 "days_per_page",
                                                 DAYS_PER_PAGE))
        new_date_format = self.tmpl_config_get(template_file,
                                               "new_date_format",
                                               NEW_DATE_FORMAT, raw=1)

        for newsitem in self.items(max_items=items_per_page,
                                   max_days=days_per_page,
                                   channels=channel_list):
            item_info = template_info(newsitem, date_format)
            chan_info = channels[newsitem._channel]
            for k, v in chan_info.items():
                item_info["channel_" + k] = v

            # Check for the start of a new day (year, month, day differ)
            if prev_date[:3] != newsitem.date[:3]:
                prev_date = newsitem.date
                item_info["new_date"] = time.strftime(new_date_format,
                                                      newsitem.date)

            # Check for the start of a new channel
            if "new_date" in item_info \
               or prev_channel != newsitem._channel:
                prev_channel = newsitem._channel
                item_info["new_channel"] = newsitem._channel.url

            items_list.append(item_info)

        return items_list

    def run(self, planet_name, planet_link, template_files, offline = False):
        """Configure the planet, subscribe to every feed and update them.

        Options are read from the "Planet" section of the configuration.
        Every other section that is not one of template_files is taken to
        be a feed URL.  Feeds are not fetched when offline is true or when
        their last status was 410; a failing update is logged and does not
        stop the remaining feeds.
        """
        log = logging.getLogger("planet.runner")

        # Create a planet
        log.info("Loading cached data")
        if self.config.has_option("Planet", "cache_directory"):
            self.cache_directory = self.config.get("Planet", "cache_directory")
        if self.config.has_option("Planet", "new_feed_items"):
            self.new_feed_items = int(self.config.get("Planet", "new_feed_items"))
        self.user_agent = "%s +%s %s" % (planet_name, planet_link,
                                         self.user_agent)
        if self.config.has_option("Planet", "filter"):
            self.filter = self.config.get("Planet", "filter")
        # The exclude property is honoured by items(), so load it as well.
        if self.config.has_option("Planet", "exclude"):
            self.exclude = self.config.get("Planet", "exclude")

        # The other configuration blocks are channels to subscribe to
        for feed_url in self.config.sections():
            if feed_url == "Planet" or feed_url in template_files:
                continue

            # Create a channel, configure it and subscribe it
            channel = Channel(self, feed_url)
            self.subscribe(channel)

            # Update it
            try:
                if not offline and not channel.url_status == '410':
                    channel.update()
            except KeyboardInterrupt:
                raise
            except:
                log.exception("Update of <%s> failed", feed_url)

    def generate_all_files(self, template_files, planet_name,
                           planet_link, planet_feed, owner_name, owner_email):
        """Render every template in template_files to its output file.

        The output file is named after the template without its extension
        and placed in the configured output directory.  A failure to
        render or write one template is logged and the remaining templates
        are still processed.
        """
        log = logging.getLogger("planet.runner")
        # Go-go-gadget-template
        for template_file in template_files:
            manager = htmltmpl.TemplateManager()
            log.info("Processing template %s", template_file)
            try:
                template = manager.prepare(template_file)
            except htmltmpl.TemplateError:
                # Retry with just the file name of the template
                template = manager.prepare(os.path.basename(template_file))
            # Read the configuration
            output_dir = self.tmpl_config_get(template_file,
                                              "output_dir", OUTPUT_DIR)
            date_format = self.tmpl_config_get(template_file,
                                               "date_format", DATE_FORMAT,
                                               raw=1)
            encoding = self.tmpl_config_get(template_file, "encoding", ENCODING)

            # We treat each template individually
            base = os.path.splitext(os.path.basename(template_file))[0]
            url = os.path.join(planet_link, base)
            output_file = os.path.join(output_dir, base)

            # Gather information
            channels, channels_list = self.gather_channel_info(template_file)
            items_list = self.gather_items_info(channels, template_file)

            # Process the template
            tp = htmltmpl.TemplateProcessor(html_escape=0)
            tp.set("Items", items_list)
            tp.set("Channels", channels_list)

            # Generic information
            tp.set("generator", VERSION)
            tp.set("name", planet_name)
            tp.set("link", planet_link)
            tp.set("owner_name", owner_name)
            tp.set("owner_email", owner_email)
            tp.set("url", url)

            if planet_feed:
                tp.set("feed", planet_feed)
                if planet_feed.find('rss') >= 0:
                    feedtype = 'rss'
                else:
                    feedtype = 'atom'
                tp.set("feedtype", feedtype)

            # Update time
            date = time.gmtime()
            tp.set("date", time.strftime(date_format, date))
            tp.set("date_iso", time.strftime(TIMEFMT_ISO, date))
            tp.set("date_822", time.strftime(TIMEFMT_822, date))

            try:
                log.info("Writing %s", output_file)

                # Render and encode before the file is opened, so that a
                # rendering failure does not truncate the previous output.
                output = tp.process(template)
                wanted = encoding.lower()
                if wanted in ("utf-8", "utf8"):
                    # UTF-8 output is the default because we use that
                    # internally
                    pass
                elif wanted in ("xml", "html", "sgml"):
                    # Magic for Python 2.3 users
                    output = output.decode("utf-8")
                    output = output.encode("ascii", "xmlcharrefreplace")
                else:
                    # Must be a "known" encoding
                    output = output.decode("utf-8")
                    output = output.encode(encoding, "replace")

                output_fd = open(output_file, "w")
                try:
                    output_fd.write(output)
                finally:
                    # Close the file even when the write fails
                    output_fd.close()
            except KeyboardInterrupt:
                raise
            except:
                log.exception("Write of %s failed", output_file)

    def channels(self, hidden=0, sorted=1):
        """Return the list of channels.

        Hidden channels are only included when hidden is true.  When
        sorted is true the channels are ordered by name.
        """
        channels = []
        for channel in self._channels:
            if hidden or not channel.has_key("hidden"):
                channels.append((channel.name, channel))

        if sorted:
            channels.sort()

        return [ c[-1] for c in channels ]

    def find_by_basename(self, basename):
        """Return the channel whose cache basename is basename, or None."""
        for channel in self._channels:
            if basename == channel.cache_basename():
                return channel
        return None

    def subscribe(self, channel):
        """Subscribe the planet to the channel."""
        self._channels.append(channel)

    def unsubscribe(self, channel):
        """Unsubscribe the planet from the channel."""
        self._channels.remove(channel)

    def _accepts(self, title, content, filters, excludes):
        """Return true if an article passes all filters and no excludes.

        An expression matches an article when it is found in either the
        title or the content.
        """
        for regex in filters:
            if not (regex.search(title) or regex.search(content)):
                return 0
        for regex in excludes:
            if regex.search(title) or regex.search(content):
                return 0
        return 1

    def items(self, hidden=0, sorted=1, max_items=0, max_days=0, channels=None):
        """Return an optionally filtered list of items in the channel.

        The filters are applied in the following order:

        If hidden is true then items in hidden channels and hidden items
        will be returned.

        If sorted is true then the item list will be sorted with the newest
        first.

        If max_items is non-zero then this number of items, at most, will
        be returned.

        If max_days is non-zero then any items older than the newest by
        this number of days won't be returned.  Requires sorted=1 to work.

        Items are also dropped when they fail the planet's or their
        channel's filter/exclude expressions, and an item id is only
        returned once.  channels optionally gives the channels to take
        the items from; by default all subscribed channels are used.

        The sharp-eyed will note that this looks a little strange code-wise,
        it turns out that Python gets *really* slow if we try to sort the
        actual items themselves.  Also we use mktime here, but it's ok
        because we discard the numbers and just need them to be relatively
        consistent between each other.
        """
        planet_filter_re = None
        if self.filter:
            planet_filter_re = re.compile(self.filter, re.I)
        planet_exclude_re = None
        if self.exclude:
            planet_exclude_re = re.compile(self.exclude, re.I)

        items = []
        seen_guids = {}
        if not channels:
            channels = self.channels(hidden=hidden, sorted=0)
        for channel in channels:
            # The channel's expressions are the same for all of its items,
            # so compile them once per channel rather than once per item.
            channel_filter_re = None
            if channel.filter:
                channel_filter_re = re.compile(channel.filter, re.I)
            channel_exclude_re = None
            if channel.exclude:
                channel_exclude_re = re.compile(channel.exclude, re.I)

            filters = [ r for r in (planet_filter_re, channel_filter_re)
                        if r ]
            excludes = [ r for r in (planet_exclude_re, channel_exclude_re)
                         if r ]

            for item in channel._items.values():
                if not hidden and item.has_key("hidden"):
                    continue

                if filters or excludes:
                    title = ""
                    if item.has_key("title"):
                        title = item.title
                    content = item.get_content("content")
                    if not self._accepts(title, content, filters, excludes):
                        continue

                if item.id not in seen_guids:
                    seen_guids[item.id] = 1
                    items.append((time.mktime(item.date), item.order, item))

        # Sort the list, newest first
        if sorted:
            items.sort()
            items.reverse()

        # Apply max_items filter
        if len(items) and max_items:
            items = items[:max_items]

        # Apply max_days filter: cut the list at the first item that is
        # max_days or more older than the newest one.
        if len(items) and max_days:
            max_count = 0
            max_time = items[0][0] - max_days * self._SECONDS_PER_DAY
            for item in items:
                if item[0] > max_time:
                    max_count += 1
                else:
                    items = items[:max_count]
                    break

        return [ i[-1] for i in items ]
+
class Channel(cache.CachedInfo):
    """A list of news items.

    This class represents a list of news items taken from the feed of
    a website or other source.

    Properties:
        url             URL of the feed.
        url_etag        E-Tag of the feed URL.
        url_modified    Last modified time of the feed URL.
        url_status      Last HTTP status of the feed URL.
        hidden          Channel should be hidden (True if exists).
        name            Name of the feed owner, or feed title.
        next_order      Next order number to be assigned to NewsItem

        updated         Correct UTC-Normalised update time of the feed.
        last_updated    Correct UTC-Normalised time the feed was last updated.

        id              An identifier the feed claims is unique (*).
        title           One-line title (*).
        link            Link to the original format feed (*).
        tagline         Short description of the feed (*).
        info            Longer description of the feed (*).

        modified        Date the feed claims to have been modified (*).

        author          Name of the author (*).
        publisher       Name of the publisher (*).
        generator       Name of the feed generator (*).
        category        Category name (*).
        copyright       Copyright information for humans to read (*).
        license         Link to the licence for the content (*).
        docs            Link to the specification of the feed format (*).
        language        Primary language (*).
        errorreportsto  E-Mail address to send error reports to (*).

        image_url       URL of an associated image (*).
        image_link      Link to go with the associated image (*).
        image_title     Alternative text of the associated image (*).
        image_width     Width of the associated image (*).
        image_height    Height of the associated image (*).

        filter          A regular expression that articles must match.
        exclude         A regular expression that articles must not match.

    Properties marked (*) will only be present if the original feed
    contained them.  Note that the optional 'modified' date field is simply
    a claim made by the item and parsed from the information given, 'updated'
    (and 'last_updated') are far more reliable sources of information.

    Some feeds may define additional properties to those above.
    """
    IGNORE_KEYS = ("links", "contributors", "textinput", "cloud", "categories",
                   "url", "href", "url_etag", "url_modified", "tags",
                   "itunes_explicit")

    def __init__(self, planet, url):
        """Open the cache of the feed at url and load what it holds.

        The planet's cache directory is created if it does not exist.
        After the cached channel and entry information has been read, the
        options of the configuration section named after url (if any) are
        applied with cached=0.
        """
        if not os.path.isdir(planet.cache_directory):
            os.makedirs(planet.cache_directory)
        cache_filename = cache.filename(planet.cache_directory, url)
        # Mode is octal 0666, spelled through int() so that the literal
        # parses the same on every Python version.
        cache_file = dbhash.open(cache_filename, "c", int("0666", 8))

        cache.CachedInfo.__init__(self, cache_file, url, root=1)

        self._items = {}
        self._planet = planet
        self._expired = []
        self.url = url
        # retain the original URL for error reporting
        self.configured_url = url
        self.url_etag = None
        self.url_status = None
        self.url_modified = None
        self.name = None
        self.updated = None
        self.last_updated = None
        self.filter = None
        self.exclude = None
        self.next_order = "0"
        self.cache_read()
        self.cache_read_entries()

        if planet.config.has_section(url):
            for option in planet.config.options(url):
                value = planet.config.get(url, option)
                self.set_as_string(option, value, cached=0)

    def has_item(self, id_):
        """Check whether the item exists in the channel."""
        return id_ in self._items

    def get_item(self, id_):
        """Return the item from the channel.

        Raises KeyError if the channel has no item with this id.
        """
        return self._items[id_]

    # Special methods
    __contains__ = has_item

    def items(self, hidden=0, sorted=0):
        """Return the item list.

        Hidden items are only included when hidden is true.  When sorted
        is true the newest items come first.
        """
        items = []
        for item in self._items.values():
            if hidden or not item.has_key("hidden"):
                items.append((time.mktime(item.date), item.order, item))

        if sorted:
            items.sort()
            items.reverse()

        return [ i[-1] for i in items ]

    def __iter__(self):
        """Iterate the sorted item list."""
        return iter(self.items(sorted=1))

    def cache_read_entries(self):
        """Read entry information from the cache.

        Every cache key that contains no space and is not a key of the
        channel itself is taken to be the id of a news item.
        """
        keys = self._cache.keys()
        for key in keys:
            if key.find(" ") != -1: continue
            if self.has_key(key): continue

            item = NewsItem(self, key)
            self._items[key] = item

    def cache_basename(self):
        """Return the cache file name for this channel, without directory."""
        return cache.filename('', self._id)

    def cache_write(self, sync=1):
        """Write channel and item information to the cache.

        Items that were expired since the last write are cleared from the
        cache, and the list of expired items is then emptied.
        """
        for item in self._items.values():
            item.cache_write(sync=0)
        for item in self._expired:
            item.cache_clear(sync=0)
        cache.CachedInfo.cache_write(self, sync)

        self._expired = []

    def feed_information(self):
        """
        Returns a description string for the feed embedded in this channel.

        This will usually simply be the feed url embedded in <>, but in the
        case where the current self.url has changed from the original
        self.configured_url the string will contain both pieces of information.
        This is so that the URL in question is easier to find in logging
        output: getting an error about a URL that doesn't appear in your config
        file is annoying.
        """
        if self.url == self.configured_url:
            return "<%s>" % self.url
        else:
            return "<%s> (formerly <%s>)" % (self.url, self.configured_url)

    def update(self):
        """Download the feed to refresh the information.

        This does the actual work of pulling down the feed and if it changes
        updates the cached information about the feed and entries within it.
        """
        info = feedparser.parse(self.url,
                                etag=self.url_etag, modified=self.url_modified,
                                agent=self._planet.user_agent)

        # Work out a status even when the parser did not report one
        if info.has_key("status"):
            self.url_status = str(info.status)
        elif info.has_key("entries") and len(info.entries)>0:
            self.url_status = str(200)
        elif info.bozo and info.bozo_exception.__class__.__name__=='Timeout':
            self.url_status = str(408)
        else:
            self.url_status = str(500)

        if self.url_status == '301' and \
           (info.has_key("entries") and len(info.entries)>0):
            log.warning("Feed has moved from <%s> to <%s>", self.url, info.url)
            try:
                os.link(cache.filename(self._planet.cache_directory, self.url),
                        cache.filename(self._planet.cache_directory, info.url))
            except KeyboardInterrupt:
                raise
            except:
                # Linking the cache to the new name is best effort only
                # (e.g. no hard link support, or the target exists).
                log.debug("Could not link cache of <%s> to <%s>",
                          self.url, info.url)
            self.url = info.url
        elif self.url_status == '304':
            log.info("Feed %s unchanged", self.feed_information())
            return
        elif self.url_status == '410':
            log.info("Feed %s gone", self.feed_information())
            # Persist the status; Planet.run() skips feeds that are gone
            self.cache_write()
            return
        elif self.url_status == '408':
            log.warning("Feed %s timed out", self.feed_information())
            return
        elif int(self.url_status) >= 400:
            log.error("Error %s while updating feed %s",
                      self.url_status, self.feed_information())
            return
        else:
            log.info("Updating feed %s", self.feed_information())

        self.url_etag = info.has_key("etag") and info.etag or None
        self.url_modified = info.has_key("modified") and info.modified or None
        if self.url_etag is not None:
            log.debug("E-Tag: %s", self.url_etag)
        if self.url_modified is not None:
            log.debug("Last Modified: %s",
                      time.strftime(TIMEFMT_ISO, self.url_modified))

        self.update_info(info.feed)
        self.update_entries(info.entries)
        self.cache_write()

    def update_info(self, feed):
        """Update information from the feed.

        This reads the feed information supplied by feedparser and updates
        the cached information about the feed.  These are the various
        potentially interesting properties that you might care about.
        """
        for key in feed.keys():
            if key in self.IGNORE_KEYS or key + "_parsed" in self.IGNORE_KEYS:
                # Ignored fields
                pass
            elif feed.has_key(key + "_parsed"):
                # Ignore unparsed date fields
                pass
            elif key.endswith("_detail"):
                # retain name and email sub-fields
                if feed[key].has_key('name') and feed[key].name:
                    self.set_as_string(key.replace("_detail","_name"), \
                        feed[key].name)
                if feed[key].has_key('email') and feed[key].email:
                    self.set_as_string(key.replace("_detail","_email"), \
                        feed[key].email)
            elif key == "items":
                # Ignore items field
                pass
            elif key.endswith("_parsed"):
                # Date fields
                if feed[key] is not None:
                    self.set_as_date(key[:-len("_parsed")], feed[key])
            elif key == "image":
                # Image field: save all the information
                if feed[key].has_key("url"):
                    self.set_as_string(key + "_url", feed[key].url)
                if feed[key].has_key("link"):
                    self.set_as_string(key + "_link", feed[key].link)
                if feed[key].has_key("title"):
                    self.set_as_string(key + "_title", feed[key].title)
                if feed[key].has_key("width"):
                    self.set_as_string(key + "_width", str(feed[key].width))
                if feed[key].has_key("height"):
                    self.set_as_string(key + "_height", str(feed[key].height))
            elif isinstance(feed[key], (str, unicode)):
                # String fields: sanitize HTML and escape plain text first
                try:
                    detail = key + '_detail'
                    if feed.has_key(detail) and feed[detail].has_key('type'):
                        if feed[detail].type == 'text/html':
                            feed[key] = sanitize.HTML(feed[key])
                        elif feed[detail].type == 'text/plain':
                            feed[key] = escape(feed[key])
                    self.set_as_string(key, feed[key])
                except KeyboardInterrupt:
                    raise
                except:
                    log.exception("Ignored '%s' of <%s>, unknown format",
                                  key, self.url)

    def update_entries(self, entries):
        """Update entries from the feed.

        This reads the entries supplied by feedparser and updates the
        cached information about them.  It's at this point we update
        the 'updated' timestamp and keep the old one in 'last_updated',
        these provide boundaries for acceptable entry times.

        If this is the first time a feed has been updated then most of the
        items will be marked as hidden, according to Planet.new_feed_items.

        If the feed does not contain items which, according to the sort order,
        should be there; those items are assumed to have been expired from
        the feed or replaced and are removed from the cache.
        """
        if not len(entries):
            return

        self.last_updated = self.updated
        self.updated = time.gmtime()

        new_items = []
        feed_items = []
        for entry in entries:
            # Try really hard to find some kind of unique identifier
            if entry.has_key("id"):
                entry_id = cache.utf8(entry.id)
            elif entry.has_key("link"):
                entry_id = cache.utf8(entry.link)
            elif entry.has_key("title"):
                entry_id = (self.url + "/"
                            + md5.new(cache.utf8(entry.title)).hexdigest())
            elif entry.has_key("summary"):
                entry_id = (self.url + "/"
                            + md5.new(cache.utf8(entry.summary)).hexdigest())
            else:
                log.error("Unable to find or generate id, entry ignored")
                continue

            # Create the item if necessary and update
            if self.has_item(entry_id):
                item = self._items[entry_id]
            else:
                item = NewsItem(self, entry_id)
                self._items[entry_id] = item
                new_items.append(item)
            item.update(entry)
            feed_items.append(entry_id)

            # Hide excess items the first time through
            if self.last_updated is None and self._planet.new_feed_items \
               and len(feed_items) > self._planet.new_feed_items:
                item.hidden = "yes"
                log.debug("Marked <%s> as hidden (new feed)", entry_id)

        # Assign order numbers in reverse
        new_items.reverse()
        for item in new_items:
            item.order = self.next_order = str(int(self.next_order) + 1)

        # Check for expired or replaced items.  The ids are put in a
        # dictionary so that each membership test is a single lookup.
        in_feed = {}
        for entry_id in feed_items:
            in_feed[entry_id] = 1
        feed_count = len(feed_items)
        log.debug("Items in Feed: %d", feed_count)
        for item in self.items(sorted=1):
            if feed_count < 1:
                break
            elif item.id in in_feed:
                feed_count -= 1
            elif item._channel.url_status != '226':
                del(self._items[item.id])
                self._expired.append(item)
                log.debug("Removed expired or replaced item <%s>", item.id)

    def get_name(self, key):
        """Return the key containing the name.

        The key argument is not used: the value of "name" is returned if
        it is set, otherwise that of "title", otherwise an empty string.
        """
        for key in ("name", "title"):
            if self.has_key(key) and self.key_type(key) != self.NULL:
                return self.get_as_string(key)

        return ""
+
class NewsItem(cache.CachedInfo):
    """An item of news.

    This class represents a single item of news on a channel.  They're
    created by members of the Channel class and accessible through it.

    Properties:
        id              Channel-unique identifier for this item.
        id_hash         Relatively short, printable cryptographic hash of id
        date            Corrected UTC-Normalised update time, for sorting.
        order           Order in which items on the same date can be sorted.
        hidden          Item should be hidden (True if exists).

        title           One-line title (*).
        link            Link to the original format text (*).
        summary         Short first-page summary (*).
        content         Full HTML content.

        modified        Date the item claims to have been modified (*).
        issued          Date the item claims to have been issued (*).
        created         Date the item claims to have been created (*).
        expired         Date the item claims to expire (*).

        author          Name of the author (*).
        publisher       Name of the publisher (*).
        category        Category name (*).
        comments        Link to a page to enter comments (*).
        license         Link to the licence for the content (*).
        source_name     Name of the original source of this item (*).
        source_link     Link to the original source of this item (*).

    Properties marked (*) will only be present if the original feed
    contained them.  Note that the various optional date fields are
    simply claims made by the item and parsed from the information
    given, 'date' is a far more reliable source of information.

    Some feeds may define additional properties to those above.
    """
    IGNORE_KEYS = ("categories", "contributors", "enclosures", "links",
                   "guidislink", "date", "tags")

    def __init__(self, channel, id_):
        """Create the item id_ of channel and load it from the cache.

        The item shares the cache of its channel.
        """
        cache.CachedInfo.__init__(self, channel._cache, id_)

        self._channel = channel
        self.id = id_
        self.id_hash = md5.new(id_).hexdigest()
        self.date = None
        self.order = None
        self.content = None
        self.cache_read()

    def update(self, entry):
        """Update the item from the feedparser entry given.

        Each key of the entry is stored according to its kind; once all
        keys have been handled the 'date' field is (re)generated.
        """
        for key in entry.keys():
            # Fields we never store
            if key in self.IGNORE_KEYS or key + "_parsed" in self.IGNORE_KEYS:
                continue

            # Unparsed date fields: the "_parsed" twin is stored instead
            if entry.has_key(key + "_parsed"):
                continue

            if key.endswith("_detail"):
                # Keep the name, email and language sub-fields
                detail = entry[key]
                if detail.has_key('name') and detail.name:
                    self.set_as_string(key.replace("_detail","_name"),
                                       detail.name)
                if detail.has_key('email') and detail.email:
                    self.set_as_string(key.replace("_detail","_email"),
                                       detail.email)
                if detail.has_key('language') and detail.language:
                    # Only recorded when it differs from the channel's
                    if not self._channel.has_key('language') or \
                       detail.language != self._channel.language:
                        self.set_as_string(
                            key.replace("_detail","_language"),
                            detail.language)
                continue

            if key.endswith("_parsed"):
                # Date fields are stored under the name without the suffix
                if entry[key] is not None:
                    self.set_as_date(key[:-len("_parsed")], entry[key])
                continue

            if key == "source":
                # Source field: keep both the value and the url
                source = entry[key]
                if source.has_key("value"):
                    self.set_as_string(key + "_name", source.value)
                if source.has_key("url"):
                    self.set_as_string(key + "_link", source.url)
                continue

            if key == "content":
                # Content field: the values of all parts are joined
                pieces = []
                for part in entry[key]:
                    if part.type == 'text/html':
                        part.value = sanitize.HTML(part.value)
                    elif part.type == 'text/plain':
                        part.value = escape(part.value)
                    if part.has_key('language') and part.language:
                        if not self._channel.has_key('language') or \
                           part.language != self._channel.language:
                            self.set_as_string(key + "_language",
                                               part.language)
                    pieces.append(cache.utf8(part.value))
                self.set_as_string(key, "".join(pieces))
                continue

            if not isinstance(entry[key], (str, unicode)):
                continue

            # String fields: sanitize HTML and escape plain text first
            try:
                detail_key = key + '_detail'
                if entry.has_key(detail_key) \
                   and entry[detail_key].has_key('type'):
                    kind = entry[detail_key].type
                    if kind == 'text/html':
                        entry[key] = sanitize.HTML(entry[key])
                    elif kind == 'text/plain':
                        entry[key] = escape(entry[key])
                self.set_as_string(key, entry[key])
            except KeyboardInterrupt:
                raise
            except:
                log.exception("Ignored '%s' of <%s>, unknown format",
                              key, self.id)

        # Generate the date field if we need to
        self.get_date("date")

    def get_date(self, key):
        """Get (or update) the date key.

        The first of the 'updated', 'modified', 'published', 'issued' and
        'created' dates that the item has is used, but never a date later
        than the time the channel was updated.

        If the item has none of these dates, a date already stored under
        key is returned unchanged; failing that the channel's update time
        is used.

        Except when an already stored date is returned, the result is
        stored under key before it is returned.
        """
        date = None
        for claimed in ("updated", "modified", "published", "issued",
                        "created"):
            if self.has_key(claimed):
                date = self.get_as_date(claimed)
                break

        if date is None:
            if self.has_key(key) and self.key_type(key) != self.NULL:
                return self.get_as_date(key)
            date = self._channel.updated
        elif date > self._channel.updated:
            # Claimed dates later than the channel update are capped.
            # (A lower bound against last_updated is disabled in the code.)
            date = self._channel.updated

        self.set_as_date(key, date)
        return date

    def get_content(self, key):
        """Return the key containing the content.

        The key argument is not used: the first of "content", "tagline"
        and "summary" that is set is returned, or an empty string.
        """
        for key in ("content", "tagline", "summary"):
            if not self.has_key(key):
                continue
            if self.key_type(key) != self.NULL:
                return self.get_as_string(key)

        return ""