Initial patch.

[vuplus_webkit] / Websites / planet.webkit.org / planet / planet / __init__.py
diff --git a/Websites/planet.webkit.org/planet/planet/__init__.py b/Websites/planet.webkit.org/planet/planet/__init__.py

new file mode 100644 (file)

index 0000000..929920b
--- /dev/null
+++ b/Websites/planet.webkit.org/planet/planet/__init__.py
@@ -0,0 +1,953 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+"""Planet aggregator library.
+
+This package is a library for developing web sites or software that
+aggregate RSS, CDF and Atom feeds taken from elsewhere into a single,
+combined feed.
+"""
+
+__version__ = "2.0"
+__authors__ = [ "Scott James Remnant <scott@netsplit.com>",
+                "Jeff Waugh <jdub@perkypants.org>" ]
+__license__ = "Python"
+
+
+# Modules available without separate import
+import cache
+import feedparser
+import sanitize
+import htmltmpl
+import sgmllib
+try:
+    import logging
+except:
+    import compat_logging as logging
+
+# Limit the effect of "from planet import *"
+__all__ = ("cache", "feedparser", "htmltmpl", "logging",
+           "Planet", "Channel", "NewsItem")
+
+
+import os
+import md5
+import time
+import dbhash
+import re
+
+try: 
+    from xml.sax.saxutils import escape
+except:
+    def escape(data):
+        return data.replace("&","&amp;").replace(">","&gt;").replace("<","&lt;")
+
+# Version information (for generator headers)
+VERSION = ("Planet/%s +http://www.planetplanet.org" % __version__)
+
+# Default User-Agent header to send when retrieving feeds
+USER_AGENT = VERSION + " " + feedparser.USER_AGENT
+
+# Default cache directory
+CACHE_DIRECTORY = "cache"
+
+# Default number of items to display from a new feed
+NEW_FEED_ITEMS = 10
+
+# Useful common date/time formats
+# Both end in a literal zero UTC offset rather than a %z directive, so the
+# times formatted with them are presumably expected to be in UTC already.
+TIMEFMT_ISO = "%Y-%m-%dT%H:%M:%S+00:00"
+TIMEFMT_822 = "%a, %d %b %Y %H:%M:%S +0000"
+
+
+# Log instance to use here
+log = logging.getLogger("planet")
+# Logger objects that lack a warning() method get one aliased to warn()
+try:
+    log.warning
+except:
+    log.warning = log.warn
+
+# Defaults for the template file config sections
+# These are the fallbacks handed to Planet.tmpl_config_get() when neither
+# the template's section nor the "Planet" section sets the option.
+ENCODING        = "utf-8"
+ITEMS_PER_PAGE  = 60
+# 0 leaves the max_days filter of Planet.items() switched off
+DAYS_PER_PAGE   = 0
+OUTPUT_DIR      = "output"
+DATE_FORMAT     = "%B %d, %Y %I:%M %p"
+NEW_DATE_FORMAT = "%B %d, %Y"
+# Days without activity before a feed is flagged; 0 disables the check
+ACTIVITY_THRESHOLD = 0
+
+class stripHtml(sgmllib.SGMLParser):
+    "remove all tags from the data"
+    def __init__(self, data):
+        sgmllib.SGMLParser.__init__(self)
+        self.result=''
+        self.feed(data)
+        self.close()
+    def handle_data(self, data):
+        if data: self.result+=data
+
+def template_info(item, date_format):
+    """Produce a dictionary of template information."""
+    info = {}
+    for key in item.keys():
+        if item.key_type(key) == item.DATE:
+            date = item.get_as_date(key)
+            info[key] = time.strftime(date_format, date)
+            info[key + "_iso"] = time.strftime(TIMEFMT_ISO, date)
+            info[key + "_822"] = time.strftime(TIMEFMT_822, date)
+        else:
+            info[key] = item[key]
+    if 'title' in item.keys():
+        info['title_plain'] = stripHtml(info['title']).result
+
+    return info
+
+
+class Planet:
+    """A set of channels.
+
+    This class represents a set of channels for which the items will
+    be aggregated together into one combined feed.
+
+    Properties:
+        user_agent      User-Agent header to fetch feeds with.
+        cache_directory Directory to store cached channels in.
+        new_feed_items  Number of items to display from a new feed.
+        filter          A regular expression that articles must match.
+        exclude         A regular expression that articles must not match.
+    """
+    def __init__(self, config):
+        self.config = config
+
+        self._channels = []
+
+        self.user_agent = USER_AGENT
+        self.cache_directory = CACHE_DIRECTORY
+        self.new_feed_items = NEW_FEED_ITEMS
+        self.filter = None
+        self.exclude = None
+
+    def tmpl_config_get(self, template, option, default=None, raw=0, vars=None):
+        """Get a template value from the configuration, with a default."""
+        if self.config.has_option(template, option):
+            return self.config.get(template, option, raw=raw, vars=None)
+        elif self.config.has_option("Planet", option):
+            return self.config.get("Planet", option, raw=raw, vars=None)
+        else:
+            return default
+
+    def gather_channel_info(self, template_file="Planet"):
+        date_format = self.tmpl_config_get(template_file,
+                                      "date_format", DATE_FORMAT, raw=1)
+
+        activity_threshold = int(self.tmpl_config_get(template_file,
+                                            "activity_threshold",
+                                            ACTIVITY_THRESHOLD))
+
+        if activity_threshold:
+            activity_horizon = \
+                time.gmtime(time.time()-86400*activity_threshold)
+        else:
+            activity_horizon = 0
+
+        channels = {}
+        channels_list = []
+        for channel in self.channels(hidden=1):
+            channels[channel] = template_info(channel, date_format)
+            channels_list.append(channels[channel])
+
+            # identify inactive feeds
+            if activity_horizon:
+                latest = channel.items(sorted=1)
+                if len(latest)==0 or latest[0].date < activity_horizon:
+                    channels[channel]["message"] = \
+                        "no activity in %d days" % activity_threshold
+
+            # report channel level errors
+            if not channel.url_status: continue
+            status = int(channel.url_status)
+            if status == 403:
+               channels[channel]["message"] = "403: forbidden"
+            elif status == 404:
+               channels[channel]["message"] = "404: not found"
+            elif status == 408:
+               channels[channel]["message"] = "408: request timeout"
+            elif status == 410:
+               channels[channel]["message"] = "410: gone"
+            elif status == 500:
+               channels[channel]["message"] = "internal server error"
+            elif status >= 400:
+               channels[channel]["message"] = "http status %s" % status
+
+        return channels, channels_list
+
+    def gather_items_info(self, channels, template_file="Planet", channel_list=None):
+        items_list = []
+        prev_date = []
+        prev_channel = None
+
+        date_format = self.tmpl_config_get(template_file,
+                                      "date_format", DATE_FORMAT, raw=1)
+        items_per_page = int(self.tmpl_config_get(template_file,
+                                      "items_per_page", ITEMS_PER_PAGE))
+        days_per_page = int(self.tmpl_config_get(template_file,
+                                      "days_per_page", DAYS_PER_PAGE))
+        new_date_format = self.tmpl_config_get(template_file,
+                                      "new_date_format", NEW_DATE_FORMAT, raw=1)
+
+        for newsitem in self.items(max_items=items_per_page,
+                                   max_days=days_per_page,
+                                   channels=channel_list):
+            item_info = template_info(newsitem, date_format)
+            chan_info = channels[newsitem._channel]
+            for k, v in chan_info.items():
+                item_info["channel_" + k] = v
+    
+            # Check for the start of a new day
+            if prev_date[:3] != newsitem.date[:3]:
+                prev_date = newsitem.date
+                item_info["new_date"] = time.strftime(new_date_format,
+                                                      newsitem.date)
+    
+            # Check for the start of a new channel
+            if item_info.has_key("new_date") \
+                   or prev_channel != newsitem._channel:
+                prev_channel = newsitem._channel
+                item_info["new_channel"] = newsitem._channel.url
+    
+            items_list.append(item_info)
+
+        return items_list
+
+    def run(self, planet_name, planet_link, template_files, offline = False):
+        log = logging.getLogger("planet.runner")
+
+        # Create a planet
+        log.info("Loading cached data")
+        if self.config.has_option("Planet", "cache_directory"):
+            self.cache_directory = self.config.get("Planet", "cache_directory")
+        if self.config.has_option("Planet", "new_feed_items"):
+            self.new_feed_items  = int(self.config.get("Planet", "new_feed_items"))
+        self.user_agent = "%s +%s %s" % (planet_name, planet_link,
+                                              self.user_agent)
+        if self.config.has_option("Planet", "filter"):
+            self.filter = self.config.get("Planet", "filter")
+
+        # The other configuration blocks are channels to subscribe to
+        for feed_url in self.config.sections():
+            if feed_url == "Planet" or feed_url in template_files:
+                continue
+
+            # Create a channel, configure it and subscribe it
+            channel = Channel(self, feed_url)
+            self.subscribe(channel)
+
+            # Update it
+            try:
+                if not offline and not channel.url_status == '410':
+                    channel.update()
+            except KeyboardInterrupt:
+                raise
+            except:
+                log.exception("Update of <%s> failed", feed_url)
+
+    def generate_all_files(self, template_files, planet_name,
+                planet_link, planet_feed, owner_name, owner_email):
+        
+        log = logging.getLogger("planet.runner")
+        # Go-go-gadget-template
+        for template_file in template_files:
+            manager = htmltmpl.TemplateManager()
+            log.info("Processing template %s", template_file)
+            try:
+                template = manager.prepare(template_file)
+            except htmltmpl.TemplateError:
+                template = manager.prepare(os.path.basename(template_file))
+            # Read the configuration
+            output_dir = self.tmpl_config_get(template_file,
+                                         "output_dir", OUTPUT_DIR)
+            date_format = self.tmpl_config_get(template_file,
+                                          "date_format", DATE_FORMAT, raw=1)
+            encoding = self.tmpl_config_get(template_file, "encoding", ENCODING)
+        
+            # We treat each template individually
+            base = os.path.splitext(os.path.basename(template_file))[0]
+            url = os.path.join(planet_link, base)
+            output_file = os.path.join(output_dir, base)
+
+            # Gather information
+            channels, channels_list = self.gather_channel_info(template_file) 
+            items_list = self.gather_items_info(channels, template_file) 
+
+            # Gather item information
+    
+            # Process the template
+            tp = htmltmpl.TemplateProcessor(html_escape=0)
+            tp.set("Items", items_list)
+            tp.set("Channels", channels_list)
+        
+            # Generic information
+            tp.set("generator",   VERSION)
+            tp.set("name",        planet_name)
+            tp.set("link",        planet_link)
+            tp.set("owner_name",  owner_name)
+            tp.set("owner_email", owner_email)
+            tp.set("url",         url)
+        
+            if planet_feed:
+                tp.set("feed", planet_feed)
+                tp.set("feedtype", planet_feed.find('rss')>=0 and 'rss' or 'atom')
+            
+            # Update time
+            date = time.gmtime()
+            tp.set("date",        time.strftime(date_format, date))
+            tp.set("date_iso",    time.strftime(TIMEFMT_ISO, date))
+            tp.set("date_822",    time.strftime(TIMEFMT_822, date))
+
+            try:
+                log.info("Writing %s", output_file)
+                output_fd = open(output_file, "w")
+                if encoding.lower() in ("utf-8", "utf8"):
+                    # UTF-8 output is the default because we use that internally
+                    output_fd.write(tp.process(template))
+                elif encoding.lower() in ("xml", "html", "sgml"):
+                    # Magic for Python 2.3 users
+                    output = tp.process(template).decode("utf-8")
+                    output_fd.write(output.encode("ascii", "xmlcharrefreplace"))
+                else:
+                    # Must be a "known" encoding
+                    output = tp.process(template).decode("utf-8")
+                    output_fd.write(output.encode(encoding, "replace"))
+                output_fd.close()
+            except KeyboardInterrupt:
+                raise
+            except:
+                log.exception("Write of %s failed", output_file)
+
+    def channels(self, hidden=0, sorted=1):
+        """Return the list of channels."""
+        channels = []
+        for channel in self._channels:
+            if hidden or not channel.has_key("hidden"):
+                channels.append((channel.name, channel))
+
+        if sorted:
+            channels.sort()
+
+        return [ c[-1] for c in channels ]
+
+    def find_by_basename(self, basename):
+        for channel in self._channels:
+            if basename == channel.cache_basename(): return channel
+
+    def subscribe(self, channel):
+        """Subscribe the planet to the channel.
+
+        The channel is appended to the planet's channel list; no check is
+        made for a channel that is already subscribed.
+        """
+        self._channels.append(channel)
+
+    def unsubscribe(self, channel):
+        """Unsubscribe the planet from the channel.
+
+        Raises ValueError (from list.remove) if the channel is not
+        currently subscribed.
+        """
+        self._channels.remove(channel)
+
+    def items(self, hidden=0, sorted=1, max_items=0, max_days=0, channels=None):
+        """Return an optionally filtered list of items in the channel.
+
+        The filters are applied in the following order:
+
+        If hidden is true then items in hidden channels and hidden items
+        will be returned.
+
+        If sorted is true then the item list will be sorted with the newest
+        first.
+
+        If max_items is non-zero then this number of items, at most, will
+        be returned.
+
+        If max_days is non-zero then any items older than the newest by
+        this number of days won't be returned.  Requires sorted=1 to work.
+
+
+        The sharp-eyed will note that this looks a little strange code-wise,
+        it turns out that Python gets *really* slow if we try to sort the
+        actual items themselves.  Also we use mktime here, but it's ok
+        because we discard the numbers and just need them to be relatively
+        consistent between each other.
+        """
+        planet_filter_re = None
+        if self.filter:
+            planet_filter_re = re.compile(self.filter, re.I)
+        planet_exclude_re = None
+        if self.exclude:
+            planet_exclude_re = re.compile(self.exclude, re.I)
+            
+        items = []
+        seen_guids = {}
+        if not channels: channels=self.channels(hidden=hidden, sorted=0)
+        for channel in channels:
+            for item in channel._items.values():
+                if hidden or not item.has_key("hidden"):
+
+                    channel_filter_re = None
+                    if channel.filter:
+                        channel_filter_re = re.compile(channel.filter,
+                                                       re.I)
+                    channel_exclude_re = None
+                    if channel.exclude:
+                        channel_exclude_re = re.compile(channel.exclude,
+                                                        re.I)
+                    if (planet_filter_re or planet_exclude_re \
+                        or channel_filter_re or channel_exclude_re):
+                        title = ""
+                        if item.has_key("title"):
+                            title = item.title
+                        content = item.get_content("content")
+
+                    if planet_filter_re:
+                        if not (planet_filter_re.search(title) \
+                                or planet_filter_re.search(content)):
+                            continue
+
+                    if planet_exclude_re:
+                        if (planet_exclude_re.search(title) \
+                            or planet_exclude_re.search(content)):
+                            continue
+
+                    if channel_filter_re:
+                        if not (channel_filter_re.search(title) \
+                                or channel_filter_re.search(content)):
+                            continue
+
+                    if channel_exclude_re:
+                        if (channel_exclude_re.search(title) \
+                            or channel_exclude_re.search(content)):
+                            continue
+
+                    if not seen_guids.has_key(item.id):
+                        seen_guids[item.id] = 1;
+                        items.append((time.mktime(item.date), item.order, item))
+
+        # Sort the list
+        if sorted:
+            items.sort()
+            items.reverse()
+
+        # Apply max_items filter
+        if len(items) and max_items:
+            items = items[:max_items]
+
+        # Apply max_days filter
+        if len(items) and max_days:
+            max_count = 0
+            max_time = items[0][0] - max_days * 84600
+            for item in items:
+                if item[0] > max_time:
+                    max_count += 1
+                else:
+                    items = items[:max_count]
+                    break
+
+        return [ i[-1] for i in items ]
+
+class Channel(cache.CachedInfo):
+    """A list of news items.
+
+    This class represents a list of news items taken from the feed of
+    a website or other source.
+
+    Properties:
+        url             URL of the feed.
+        url_etag        E-Tag of the feed URL.
+        url_modified    Last modified time of the feed URL.
+        url_status      Last HTTP status of the feed URL.
+        hidden          Channel should be hidden (True if exists).
+        name            Name of the feed owner, or feed title.
+        next_order      Next order number to be assigned to NewsItem
+
+        updated         Correct UTC-Normalised update time of the feed.
+        last_updated    Correct UTC-Normalised time the feed was last updated.
+
+        id              An identifier the feed claims is unique (*).
+        title           One-line title (*).
+        link            Link to the original format feed (*).
+        tagline         Short description of the feed (*).
+        info            Longer description of the feed (*).
+
+        modified        Date the feed claims to have been modified (*).
+
+        author          Name of the author (*).
+        publisher       Name of the publisher (*).
+        generator       Name of the feed generator (*).
+        category        Category name (*).
+        copyright       Copyright information for humans to read (*).
+        license         Link to the licence for the content (*).
+        docs            Link to the specification of the feed format (*).
+        language        Primary language (*).
+        errorreportsto  E-Mail address to send error reports to (*).
+
+        image_url       URL of an associated image (*).
+        image_link      Link to go with the associated image (*).
+        image_title     Alternative text of the associated image (*).
+        image_width     Width of the associated image (*).
+        image_height    Height of the associated image (*).
+
+        filter          A regular expression that articles must match.
+        exclude         A regular expression that articles must not match.
+
+    Properties marked (*) will only be present if the original feed
+    contained them.  Note that the optional 'modified' date field is simply
+    a claim made by the item and parsed from the information given, 'updated'
+    (and 'last_updated') are far more reliable sources of information.
+
+    Some feeds may define additional properties to those above.
+    """
+    IGNORE_KEYS = ("links", "contributors", "textinput", "cloud", "categories",
+                   "url", "href", "url_etag", "url_modified", "tags", "itunes_explicit")
+
+    def __init__(self, planet, url):
+        """Open the channel's cache, load it and apply the configuration.
+
+        planet is the Planet the channel belongs to; its cache_directory
+        is created if it does not exist yet.  url is the feed URL, which
+        is also the name of the configuration section read for options.
+        """
+        if not os.path.isdir(planet.cache_directory):
+            os.makedirs(planet.cache_directory)
+        cache_filename = cache.filename(planet.cache_directory, url)
+        # "c" opens the database read-write, creating it when missing
+        cache_file = dbhash.open(cache_filename, "c", 0666)
+
+        cache.CachedInfo.__init__(self, cache_file, url, root=1)
+
+        self._items = {}
+        self._planet = planet
+        self._expired = []
+        self.url = url
+        # retain the original URL for error reporting
+        self.configured_url = url
+        self.url_etag = None
+        self.url_status = None
+        self.url_modified = None
+        self.name = None
+        self.updated = None
+        self.last_updated = None
+        self.filter = None
+        self.exclude = None
+        self.next_order = "0"
+        # NOTE(review): presumably the cache reads below replace the
+        # defaults above with any cached values -- confirm in
+        # cache.CachedInfo.
+        self.cache_read()
+        self.cache_read_entries()
+
+        # Options from the feed's configuration section are applied last
+        # and are set with cached=0
+        if planet.config.has_section(url):
+            for option in planet.config.options(url):
+                value = planet.config.get(url, option)
+                self.set_as_string(option, value, cached=0)
+
+    def has_item(self, id_):
+        """Check whether the item exists in the channel."""
+        return self._items.has_key(id_)
+
+    def get_item(self, id_):
+        """Return the item from the channel.
+
+        Raises KeyError if the channel has no item with the id id_.
+        """
+        return self._items[id_]
+
+    # Special methods
+    # "id_ in channel" gives the same answer as channel.has_item(id_)
+    __contains__ = has_item
+
+    def items(self, hidden=0, sorted=0):
+        """Return the item list."""
+        items = []
+        for item in self._items.values():
+            if hidden or not item.has_key("hidden"):
+                items.append((time.mktime(item.date), item.order, item))
+
+        if sorted:
+            items.sort()
+            items.reverse()
+
+        return [ i[-1] for i in items ]
+
+    def __iter__(self):
+        """Iterate the sorted item list.
+
+        Iterates over items(sorted=1), so hidden items are not included.
+        """
+        return iter(self.items(sorted=1))
+
+    def cache_read_entries(self):
+        """Read entry information from the cache."""
+        keys = self._cache.keys()
+        for key in keys:
+            if key.find(" ") != -1: continue
+            if self.has_key(key): continue
+
+            item = NewsItem(self, key)
+            self._items[key] = item
+
+    def cache_basename(self):
+        """Return cache.filename() of this channel's id with an empty directory."""
+        return cache.filename('',self._id)
+
+    def cache_write(self, sync=1):
+        """Write channel and item information to the cache.
+
+        Current items are written and expired items are cleared, both
+        with sync=0; the channel's own information is written last with
+        the caller's sync value.  The list of expired items is then
+        emptied.
+        """
+        for item in self._items.values():
+            item.cache_write(sync=0)
+        for item in self._expired:
+            item.cache_clear(sync=0)
+        cache.CachedInfo.cache_write(self, sync)
+
+        self._expired = []
+
+    def feed_information(self):
+        """
+        Returns a description string for the feed embedded in this channel.
+
+        This will usually simply be the feed url embedded in <>, but in the
+        case where the current self.url has changed from the original
+        self.configured_url the string will contain both pieces of information.
+        This is so that the URL in question is easier to find in logging
+        output: getting an error about a URL that doesn't appear in your config
+        file is annoying.
+        """
+        if self.url == self.configured_url:
+            return "<%s>" % self.url
+        else:
+            return "<%s> (formerly <%s>)" % (self.url, self.configured_url)
+
+    def update(self):
+        """Download the feed to refresh the information.
+
+        This does the actual work of pulling down the feed and if it changes
+        updates the cached information about the feed and entries within it.
+
+        The resulting status is kept as a string in url_status.  A 304,
+        a 408 and any other status of 400 or above return without
+        touching the feed information; a 410 writes the cache and
+        returns.  A 301 that still delivered entries switches self.url
+        to the new address before the update carries on.
+        """
+        info = feedparser.parse(self.url,
+                                etag=self.url_etag, modified=self.url_modified,
+                                agent=self._planet.user_agent)
+        # When the result carries no status, derive one from what came back
+        if info.has_key("status"):
+           self.url_status = str(info.status)
+        elif info.has_key("entries") and len(info.entries)>0:
+           self.url_status = str(200)
+        elif info.bozo and info.bozo_exception.__class__.__name__=='Timeout':
+           self.url_status = str(408)
+        else:
+           self.url_status = str(500)
+
+        if self.url_status == '301' and \
+           (info.has_key("entries") and len(info.entries)>0):
+            log.warning("Feed has moved from <%s> to <%s>", self.url, info.url)
+            # Best effort: make the existing cache file reachable under the
+            # name derived from the new URL; a failure here is ignored
+            try:
+                os.link(cache.filename(self._planet.cache_directory, self.url),
+                        cache.filename(self._planet.cache_directory, info.url))
+            except:
+                pass
+            self.url = info.url
+        elif self.url_status == '304':
+            log.info("Feed %s unchanged", self.feed_information())
+            return
+        elif self.url_status == '410':
+            log.info("Feed %s gone", self.feed_information())
+            # Written so the 410 status is kept in the cache
+            self.cache_write()
+            return
+        elif self.url_status == '408':
+            log.warning("Feed %s timed out", self.feed_information())
+            return
+        elif int(self.url_status) >= 400:
+            log.error("Error %s while updating feed %s",
+                      self.url_status, self.feed_information())
+            return
+        else:
+            log.info("Updating feed %s", self.feed_information())
+
+        # Remember the validators; they are sent with the next request
+        self.url_etag = info.has_key("etag") and info.etag or None
+        self.url_modified = info.has_key("modified") and info.modified or None
+        if self.url_etag is not None:
+            log.debug("E-Tag: %s", self.url_etag)
+        if self.url_modified is not None:
+            log.debug("Last Modified: %s",
+                      time.strftime(TIMEFMT_ISO, self.url_modified))
+
+        self.update_info(info.feed)
+        self.update_entries(info.entries)
+        self.cache_write()
+
+    def update_info(self, feed):
+        """Update information from the feed.
+
+        This reads the feed information supplied by feedparser and updates
+        the cached information about the feed.  These are the various
+        potentially interesting properties that you might care about.
+
+        The branches below are tried in order for every key of feed, so
+        an earlier match (an ignored key, say) wins over a later one.
+        """
+        for key in feed.keys():
+            if key in self.IGNORE_KEYS or key + "_parsed" in self.IGNORE_KEYS:
+                # Ignored fields
+                pass
+            elif feed.has_key(key + "_parsed"):
+                # Ignore unparsed date fields
+                pass
+            elif key.endswith("_detail"):
+                # retain name and  email sub-fields
+                # (stored as "<prefix>_name" and "<prefix>_email")
+                if feed[key].has_key('name') and feed[key].name:
+                    self.set_as_string(key.replace("_detail","_name"), \
+                        feed[key].name)
+                if feed[key].has_key('email') and feed[key].email:
+                    self.set_as_string(key.replace("_detail","_email"), \
+                        feed[key].email)
+            elif key == "items":
+                # Ignore items field
+                pass
+            elif key.endswith("_parsed"):
+                # Date fields
+                # (stored under the key name without the "_parsed" suffix)
+                if feed[key] is not None:
+                    self.set_as_date(key[:-len("_parsed")], feed[key])
+            elif key == "image":
+                # Image field: save all the information
+                if feed[key].has_key("url"):
+                    self.set_as_string(key + "_url", feed[key].url)
+                if feed[key].has_key("link"):
+                    self.set_as_string(key + "_link", feed[key].link)
+                if feed[key].has_key("title"):
+                    self.set_as_string(key + "_title", feed[key].title)
+                if feed[key].has_key("width"):
+                    self.set_as_string(key + "_width", str(feed[key].width))
+                if feed[key].has_key("height"):
+                    self.set_as_string(key + "_height", str(feed[key].height))
+            elif isinstance(feed[key], (str, unicode)):
+                # String fields
+                # HTML values are sanitized and plain text is escaped,
+                # according to the type given in the "<key>_detail" entry
+                try:
+                    detail = key + '_detail'
+                    if feed.has_key(detail) and feed[detail].has_key('type'):
+                        if feed[detail].type == 'text/html':
+                            feed[key] = sanitize.HTML(feed[key])
+                        elif feed[detail].type == 'text/plain':
+                            feed[key] = escape(feed[key])
+                    self.set_as_string(key, feed[key])
+                except KeyboardInterrupt:
+                    raise
+                except:
+                    log.exception("Ignored '%s' of <%s>, unknown format",
+                                  key, self.url)
+
+    def update_entries(self, entries):
+        """Update entries from the feed.
+
+        This reads the entries supplied by feedparser and updates the
+        cached information about them.  It's at this point we update
+        the 'updated' timestamp and keep the old one in 'last_updated',
+        these provide boundaries for acceptable entry times.
+
+        If this is the first time a feed has been updated then most of the
+        items will be marked as hidden, according to Planet.new_feed_items.
+
+        If the feed does not contain items which, according to the sort order,
+        should be there; those items are assumed to have been expired from
+        the feed or replaced and are removed from the cache.
+
+        Nothing at all is changed when entries is empty.
+        """
+        if not len(entries):
+            return
+
+        self.last_updated = self.updated
+        self.updated = time.gmtime()
+
+        new_items = []
+        feed_items = []
+        for entry in entries:
+            # Try really hard to find some kind of unique identifier
+            # (id, then link, then a hash of the title or of the summary)
+            if entry.has_key("id"):
+                entry_id = cache.utf8(entry.id)
+            elif entry.has_key("link"):
+                entry_id = cache.utf8(entry.link)
+            elif entry.has_key("title"):
+                entry_id = (self.url + "/"
+                            + md5.new(cache.utf8(entry.title)).hexdigest())
+            elif entry.has_key("summary"):
+                entry_id = (self.url + "/"
+                            + md5.new(cache.utf8(entry.summary)).hexdigest())
+            else:
+                log.error("Unable to find or generate id, entry ignored")
+                continue
+
+            # Create the item if necessary and update
+            if self.has_item(entry_id):
+                item = self._items[entry_id]
+            else:
+                item = NewsItem(self, entry_id)
+                self._items[entry_id] = item
+                new_items.append(item)
+            item.update(entry)
+            feed_items.append(entry_id)
+
+            # Hide excess items the first time through
+            # (last_updated is None only if 'updated' was unset before
+            # this call)
+            if self.last_updated is None  and self._planet.new_feed_items \
+                   and len(feed_items) > self._planet.new_feed_items:
+                item.hidden = "yes"
+                log.debug("Marked <%s> as hidden (new feed)", entry_id)
+
+        # Assign order numbers in reverse
+        # (so the first new entry of the feed gets the highest number)
+        new_items.reverse()
+        for item in new_items:
+            item.order = self.next_order = str(int(self.next_order) + 1)
+
+        # Check for expired or replaced items
+        # Walk the items newest first until every entry of the feed has
+        # been met; anything passed on the way that the feed no longer
+        # lists is dropped, unless the feed status is '226'.
+        feed_count = len(feed_items)
+        log.debug("Items in Feed: %d", feed_count)
+        for item in self.items(sorted=1):
+            if feed_count < 1:
+                break
+            elif item.id in feed_items:
+                feed_count -= 1
+            elif item._channel.url_status != '226':
+                del(self._items[item.id])
+                self._expired.append(item)
+                log.debug("Removed expired or replaced item <%s>", item.id)
+
+    def get_name(self, key):
+        """Return the key containing the name."""
+        for key in ("name", "title"):
+            if self.has_key(key) and self.key_type(key) != self.NULL:
+                return self.get_as_string(key)
+
+        return ""
+
+class NewsItem(cache.CachedInfo):
+    """An item of news.
+
+    This class represents a single item of news on a channel.  They're
+    created by members of the Channel class and accessible through it.
+
+    Properties:
+        id              Channel-unique identifier for this item.
+        id_hash         Relatively short, printable cryptographic hash of id
+        date            Corrected UTC-Normalised update time, for sorting.
+        order           Order in which items on the same date can be sorted.
+        hidden          Item should be hidden (True if exists).
+
+        title           One-line title (*).
+        link            Link to the original format text (*).
+        summary         Short first-page summary (*).
+        content         Full HTML content.
+
+        modified        Date the item claims to have been modified (*).
+        issued          Date the item claims to have been issued (*).
+        created         Date the item claims to have been created (*).
+        expired         Date the item claims to expire (*).
+
+        author          Name of the author (*).
+        publisher       Name of the publisher (*).
+        category        Category name (*).
+        comments        Link to a page to enter comments (*).
+        license         Link to the licence for the content (*).
+        source_name     Name of the original source of this item (*).
+        source_link     Link to the original source of this item (*).
+
+    Properties marked (*) will only be present if the original feed
+    contained them.  Note that the various optional date fields are
+    simply claims made by the item and parsed from the information
+    given, 'date' is a far more reliable source of information.
+
+    Some feeds may define additional properties to those above.
+    """
+    IGNORE_KEYS = ("categories", "contributors", "enclosures", "links",
+                   "guidislink", "date", "tags")
+
+    def __init__(self, channel, id_):
+        cache.CachedInfo.__init__(self, channel._cache, id_)
+
+        self._channel = channel
+        self.id = id_
+        self.id_hash = md5.new(id_).hexdigest()
+        self.date = None
+        self.order = None
+        self.content = None
+        self.cache_read()
+
+    def update(self, entry):
+        """Update the item from the feedparser entry given."""
+        for key in entry.keys():
+            if key in self.IGNORE_KEYS or key + "_parsed" in self.IGNORE_KEYS:
+                # Ignored fields
+                pass
+            elif entry.has_key(key + "_parsed"):
+                # Ignore unparsed date fields
+                pass
+            elif key.endswith("_detail"):
+                # retain name, email, and language sub-fields
+                if entry[key].has_key('name') and entry[key].name:
+                    self.set_as_string(key.replace("_detail","_name"), \
+                        entry[key].name)
+                if entry[key].has_key('email') and entry[key].email:
+                    self.set_as_string(key.replace("_detail","_email"), \
+                        entry[key].email)
+                if entry[key].has_key('language') and entry[key].language and \
+                   (not self._channel.has_key('language') or \
+                   entry[key].language != self._channel.language):
+                    self.set_as_string(key.replace("_detail","_language"), \
+                        entry[key].language)
+            elif key.endswith("_parsed"):
+                # Date fields
+                if entry[key] is not None:
+                    self.set_as_date(key[:-len("_parsed")], entry[key])
+            elif key == "source":
+                # Source field: save both url and value
+                if entry[key].has_key("value"):
+                    self.set_as_string(key + "_name", entry[key].value)
+                if entry[key].has_key("url"):
+                    self.set_as_string(key + "_link", entry[key].url)
+            elif key == "content":
+                # Content field: concatenate the values
+                value = ""
+                for item in entry[key]:
+                    if item.type == 'text/html':
+                        item.value = sanitize.HTML(item.value)
+                    elif item.type == 'text/plain':
+                        item.value = escape(item.value)
+                    if item.has_key('language') and item.language and \
+                       (not self._channel.has_key('language') or
+                       item.language != self._channel.language) :
+                        self.set_as_string(key + "_language", item.language)
+                    value += cache.utf8(item.value)
+                self.set_as_string(key, value)
+            elif isinstance(entry[key], (str, unicode)):
+                # String fields
+                try:
+                    detail = key + '_detail'
+                    if entry.has_key(detail):
+                        if entry[detail].has_key('type'):
+                            if entry[detail].type == 'text/html':
+                                entry[key] = sanitize.HTML(entry[key])
+                            elif entry[detail].type == 'text/plain':
+                                entry[key] = escape(entry[key])
+                    self.set_as_string(key, entry[key])
+                except KeyboardInterrupt:
+                    raise
+                except:
+                    log.exception("Ignored '%s' of <%s>, unknown format",
+                                  key, self.id)
+
+        # Generate the date field if we need to
+        self.get_date("date")
+
+    def get_date(self, key):
+        """Get (or update) the date key.
+
+        We check whether the date the entry claims to have been changed is
+        since we last updated this feed and when we pulled the feed off the
+        site.
+
+        If it is then it's probably not bogus, and we'll sort accordingly.
+
+        If it isn't then we bound it appropriately, this ensures that
+        entries appear in posting sequence but don't overlap entries
+        added in previous updates and don't creep into the next one.
+        """
+
+        for other_key in ("updated", "modified", "published", "issued", "created"):
+            if self.has_key(other_key):
+                date = self.get_as_date(other_key)
+                break
+        else:
+            date = None
+
+        if date is not None:
+            if date > self._channel.updated:
+                date = self._channel.updated
+#            elif date < self._channel.last_updated:
+#                date = self._channel.updated
+        elif self.has_key(key) and self.key_type(key) != self.NULL:
+            return self.get_as_date(key)
+        else:
+            date = self._channel.updated
+
+        self.set_as_date(key, date)
+        return date
+
+    def get_content(self, key):
+        """Return the key containing the content."""
+        for key in ("content", "tagline", "summary"):
+            if self.has_key(key) and self.key_type(key) != self.NULL:
+                return self.get_as_string(key)
+
+        return ""