| #!/usr/bin/env python |
| # -*- coding: UTF-8 -*- |
| """Planet aggregator library. |
| |
| This package is a library for developing web sites or software that |
| aggregate RSS, CDF and Atom feeds taken from elsewhere into a single, |
| combined feed. |
| """ |
| |
| __version__ = "2.0" |
| __authors__ = [ "Scott James Remnant <scott@netsplit.com>", |
| "Jeff Waugh <jdub@perkypants.org>" ] |
| __license__ = "Python" |
| |
| |
| # Modules available without separate import |
| import cache |
| import feedparser |
| import sanitize |
| import htmltmpl |
| import sgmllib |
| try: |
| import logging |
| except ImportError:
| import compat_logging as logging |
| |
| # Limit the effect of "from planet import *" |
| __all__ = ("cache", "feedparser", "htmltmpl", "logging", |
| "Planet", "Channel", "NewsItem") |
| |
| |
| import os |
| import md5 |
| import time |
| import dbhash |
| import re |
| |
| try:
|     from xml.sax.saxutils import escape
| except ImportError:
|     # Minimal fallback covering the three characters unsafe in XML
|     # character data, e.g. escape("<a & b>") -> "&lt;a &amp; b&gt;".
|     # The ampersand must be replaced first.
|     def escape(data):
|         return data.replace("&", "&amp;").replace(">", "&gt;").replace("<", "&lt;")
| |
| # Version information (for generator headers) |
| VERSION = ("Planet/%s +http://www.planetplanet.org" % __version__) |
| |
| # Default User-Agent header to send when retrieving feeds
| USER_AGENT = VERSION + " " + feedparser.USER_AGENT |
| |
| # Default cache directory |
| CACHE_DIRECTORY = "cache" |
| |
| # Default number of items to display from a new feed |
| NEW_FEED_ITEMS = 10 |
| |
| # Useful common date/time formats |
| TIMEFMT_ISO = "%Y-%m-%dT%H:%M:%S+00:00" |
| TIMEFMT_822 = "%a, %d %b %Y %H:%M:%S +0000" |
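| # For example, time.strftime(TIMEFMT_ISO, time.gmtime(0)) yields
| # "1970-01-01T00:00:00+00:00", and TIMEFMT_822 yields
| # "Thu, 01 Jan 1970 00:00:00 +0000".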
| |
| |
| # Log instance to use here |
| log = logging.getLogger("planet") |
| try: |
| log.warning |
| except AttributeError:
|     # older logging modules only provide warn()
|     log.warning = log.warn
| |
| # Defaults for the template file config sections |
| ENCODING = "utf-8" |
| ITEMS_PER_PAGE = 60 |
| DAYS_PER_PAGE = 0 |
| OUTPUT_DIR = "output" |
| DATE_FORMAT = "%B %d, %Y %I:%M %p" |
| NEW_DATE_FORMAT = "%B %d, %Y" |
| ACTIVITY_THRESHOLD = 0 |
| |
| class stripHtml(sgmllib.SGMLParser):
|     """Remove all tags from the data."""
|     def __init__(self, data):
|         sgmllib.SGMLParser.__init__(self)
|         self.result = ''
|         self.feed(data)
|         self.close()
|     def handle_data(self, data):
|         if data:
|             self.result += data
| |
| def template_info(item, date_format): |
| """Produce a dictionary of template information.""" |
| info = {} |
| for key in item.keys(): |
| if item.key_type(key) == item.DATE: |
| date = item.get_as_date(key) |
| info[key] = time.strftime(date_format, date) |
| info[key + "_iso"] = time.strftime(TIMEFMT_ISO, date) |
| info[key + "_822"] = time.strftime(TIMEFMT_822, date) |
| else: |
| info[key] = item[key] |
| if 'title' in item.keys(): |
| info['title_plain'] = stripHtml(info['title']).result |
| |
| return info |
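| # For illustration (hypothetical item): an item with an HTML 'title'
| # and a 'date' key might produce
| #
| #     {"title": "Some <b>post</b>",
| #      "title_plain": "Some post",
| #      "date": "January 01, 2006 12:00 PM",
| #      "date_iso": "2006-01-01T12:00:00+00:00",
| #      "date_822": "Sun, 01 Jan 2006 12:00:00 +0000"}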
| |
| |
| class Planet: |
| """A set of channels. |
| |
| This class represents a set of channels for which the items will |
| be aggregated together into one combined feed. |
| |
| Properties: |
| user_agent User-Agent header to fetch feeds with. |
| cache_directory Directory to store cached channels in. |
| new_feed_items Number of items to display from a new feed. |
| filter A regular expression that articles must match. |
| exclude A regular expression that articles must not match. |
| """ |
| def __init__(self, config): |
| self.config = config |
| |
| self._channels = [] |
| |
| self.user_agent = USER_AGENT |
| self.cache_directory = CACHE_DIRECTORY |
| self.new_feed_items = NEW_FEED_ITEMS |
| self.filter = None |
| self.exclude = None |
| |
| def tmpl_config_get(self, template, option, default=None, raw=0, vars=None): |
| """Get a template value from the configuration, with a default.""" |
| if self.config.has_option(template, option):
|     return self.config.get(template, option, raw=raw, vars=vars)
| elif self.config.has_option("Planet", option):
|     return self.config.get("Planet", option, raw=raw, vars=vars)
| else:
|     return default
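| # For example (hypothetical config): given
| #
| #     [Planet]
| #     items_per_page = 30
| #
| #     [index.html.tmpl]
| #     items_per_page = 10
| #
| # tmpl_config_get("index.html.tmpl", "items_per_page", ITEMS_PER_PAGE)
| # returns "10", while templates without their own setting fall back to
| # the [Planet] value "30", and to the default only if neither is set.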
| |
| def gather_channel_info(self, template_file="Planet"): |
| date_format = self.tmpl_config_get(template_file, |
| "date_format", DATE_FORMAT, raw=1) |
| |
| activity_threshold = int(self.tmpl_config_get(template_file, |
| "activity_threshold", |
| ACTIVITY_THRESHOLD)) |
| |
| if activity_threshold: |
| activity_horizon = \ |
| time.gmtime(time.time()-86400*activity_threshold) |
| else: |
| activity_horizon = 0 |
| |
| channels = {} |
| channels_list = [] |
| for channel in self.channels(hidden=1): |
| channels[channel] = template_info(channel, date_format) |
| channels_list.append(channels[channel]) |
| |
| # identify inactive feeds |
| if activity_horizon: |
| latest = channel.items(sorted=1) |
| if len(latest)==0 or latest[0].date < activity_horizon: |
| channels[channel]["message"] = \ |
| "no activity in %d days" % activity_threshold |
| |
| # report channel level errors |
| if not channel.url_status: continue |
| status = int(channel.url_status) |
| if status == 403: |
| channels[channel]["message"] = "403: forbidden" |
| elif status == 404: |
| channels[channel]["message"] = "404: not found" |
| elif status == 408: |
| channels[channel]["message"] = "408: request timeout" |
| elif status == 410: |
| channels[channel]["message"] = "410: gone" |
| elif status == 500: |
| channels[channel]["message"] = "internal server error" |
| elif status >= 400: |
| channels[channel]["message"] = "http status %s" % status |
| |
| return channels, channels_list |
| |
| def gather_items_info(self, channels, template_file="Planet", channel_list=None): |
| items_list = [] |
| prev_date = [] |
| prev_channel = None |
| |
| date_format = self.tmpl_config_get(template_file, |
| "date_format", DATE_FORMAT, raw=1) |
| items_per_page = int(self.tmpl_config_get(template_file, |
| "items_per_page", ITEMS_PER_PAGE)) |
| days_per_page = int(self.tmpl_config_get(template_file, |
| "days_per_page", DAYS_PER_PAGE)) |
| new_date_format = self.tmpl_config_get(template_file, |
| "new_date_format", NEW_DATE_FORMAT, raw=1) |
| |
| for newsitem in self.items(max_items=items_per_page, |
| max_days=days_per_page, |
| channels=channel_list): |
| item_info = template_info(newsitem, date_format) |
| chan_info = channels[newsitem._channel] |
| for k, v in chan_info.items(): |
| item_info["channel_" + k] = v |
| |
| # Check for the start of a new day |
| if prev_date[:3] != newsitem.date[:3]: |
| prev_date = newsitem.date |
| item_info["new_date"] = time.strftime(new_date_format, |
| newsitem.date) |
| |
| # Check for the start of a new channel |
| if item_info.has_key("new_date") \ |
| or prev_channel != newsitem._channel: |
| prev_channel = newsitem._channel |
| item_info["new_channel"] = newsitem._channel.url |
| |
| items_list.append(item_info) |
| |
| return items_list |
| |
| def run(self, planet_name, planet_link, template_files, offline = False): |
| log = logging.getLogger("planet.runner") |
| |
| # Create a planet |
| log.info("Loading cached data") |
| if self.config.has_option("Planet", "cache_directory"): |
| self.cache_directory = self.config.get("Planet", "cache_directory") |
| if self.config.has_option("Planet", "new_feed_items"): |
| self.new_feed_items = int(self.config.get("Planet", "new_feed_items")) |
| self.user_agent = "%s +%s %s" % (planet_name, planet_link, |
| self.user_agent) |
| if self.config.has_option("Planet", "filter"): |
| self.filter = self.config.get("Planet", "filter") |
| |
| # The other configuration blocks are channels to subscribe to |
| for feed_url in self.config.sections(): |
| if feed_url == "Planet" or feed_url in template_files: |
| continue |
| |
| # Create a channel, configure it and subscribe it |
| channel = Channel(self, feed_url) |
| self.subscribe(channel) |
| |
| # Update it |
| try: |
| if not offline and channel.url_status != '410':
| channel.update() |
| except KeyboardInterrupt: |
| raise |
| except: |
| log.exception("Update of <%s> failed", feed_url) |
| |
| def generate_all_files(self, template_files, planet_name, |
| planet_link, planet_feed, owner_name, owner_email): |
| |
| log = logging.getLogger("planet.runner") |
| # Go-go-gadget-template |
| for template_file in template_files: |
| manager = htmltmpl.TemplateManager() |
| log.info("Processing template %s", template_file) |
| try: |
| template = manager.prepare(template_file) |
| except htmltmpl.TemplateError: |
| template = manager.prepare(os.path.basename(template_file)) |
| # Read the configuration |
| output_dir = self.tmpl_config_get(template_file, |
| "output_dir", OUTPUT_DIR) |
| date_format = self.tmpl_config_get(template_file, |
| "date_format", DATE_FORMAT, raw=1) |
| encoding = self.tmpl_config_get(template_file, "encoding", ENCODING) |
| |
| # We treat each template individually |
| base = os.path.splitext(os.path.basename(template_file))[0] |
| url = os.path.join(planet_link, base) |
| output_file = os.path.join(output_dir, base) |
| |
| # Gather information |
| channels, channels_list = self.gather_channel_info(template_file) |
| items_list = self.gather_items_info(channels, template_file) |
| |
| # Process the template |
| tp = htmltmpl.TemplateProcessor(html_escape=0) |
| tp.set("Items", items_list) |
| tp.set("Channels", channels_list) |
| |
| # Generic information |
| tp.set("generator", VERSION) |
| tp.set("name", planet_name) |
| tp.set("link", planet_link) |
| tp.set("owner_name", owner_name) |
| tp.set("owner_email", owner_email) |
| tp.set("url", url) |
| |
| if planet_feed: |
| tp.set("feed", planet_feed) |
| tp.set("feedtype", planet_feed.find('rss')>=0 and 'rss' or 'atom') |
| |
| # Update time |
| date = time.gmtime() |
| tp.set("date", time.strftime(date_format, date)) |
| tp.set("date_iso", time.strftime(TIMEFMT_ISO, date)) |
| tp.set("date_822", time.strftime(TIMEFMT_822, date)) |
| |
| try: |
| log.info("Writing %s", output_file) |
| output_fd = open(output_file, "w") |
| if encoding.lower() in ("utf-8", "utf8"): |
| # UTF-8 output is the default because we use that internally |
| output_fd.write(tp.process(template)) |
| elif encoding.lower() in ("xml", "html", "sgml"): |
| # Magic for Python 2.3 users |
| output = tp.process(template).decode("utf-8") |
| output_fd.write(output.encode("ascii", "xmlcharrefreplace")) |
| else: |
| # Must be a "known" encoding |
| output = tp.process(template).decode("utf-8") |
| output_fd.write(output.encode(encoding, "replace")) |
| output_fd.close() |
| except KeyboardInterrupt: |
| raise |
| except: |
| log.exception("Write of %s failed", output_file) |
| |
| def channels(self, hidden=0, sorted=1): |
| """Return the list of channels.""" |
| channels = [] |
| for channel in self._channels: |
| if hidden or not channel.has_key("hidden"): |
| channels.append((channel.name, channel)) |
| |
| if sorted: |
| channels.sort() |
| |
| return [ c[-1] for c in channels ] |
| |
| def find_by_basename(self, basename): |
| for channel in self._channels: |
| if basename == channel.cache_basename(): return channel |
| |
| def subscribe(self, channel): |
| """Subscribe the planet to the channel.""" |
| self._channels.append(channel) |
| |
| def unsubscribe(self, channel): |
| """Unsubscribe the planet from the channel.""" |
| self._channels.remove(channel) |
| |
| def items(self, hidden=0, sorted=1, max_items=0, max_days=0, channels=None): |
| """Return an optionally filtered list of items in the channel. |
| |
| The filters are applied in the following order: |
| |
| If hidden is true then items in hidden channels and hidden items
| will also be returned.
| |
| If sorted is true then the item list will be sorted with the newest |
| first. |
| |
| If max_items is non-zero then this number of items, at most, will |
| be returned. |
| |
| If max_days is non-zero then any items older than the newest by
| this number of days won't be returned.  Requires sorted=1 to work.
| 
| The sharp-eyed will note that the code below looks a little strange:
| it turns out that Python gets *really* slow if we try to sort the
| actual items themselves, so we sort (date, order, item) tuples
| instead.  We also use mktime here, which is fine because we discard
| the numbers and only need them to be consistent relative to each
| other.
| """ |
| planet_filter_re = None |
| if self.filter: |
| planet_filter_re = re.compile(self.filter, re.I) |
| planet_exclude_re = None |
| if self.exclude: |
| planet_exclude_re = re.compile(self.exclude, re.I) |
| |
| items = [] |
| seen_guids = {} |
| if not channels: channels=self.channels(hidden=hidden, sorted=0) |
| for channel in channels: |
| for item in channel._items.values(): |
| if hidden or not item.has_key("hidden"): |
| |
| channel_filter_re = None |
| if channel.filter: |
| channel_filter_re = re.compile(channel.filter, |
| re.I) |
| channel_exclude_re = None |
| if channel.exclude: |
| channel_exclude_re = re.compile(channel.exclude, |
| re.I) |
| if (planet_filter_re or planet_exclude_re \ |
| or channel_filter_re or channel_exclude_re): |
| title = "" |
| if item.has_key("title"): |
| title = item.title |
| content = item.get_content("content") |
| |
| if planet_filter_re: |
| if not (planet_filter_re.search(title) \ |
| or planet_filter_re.search(content)): |
| continue |
| |
| if planet_exclude_re: |
| if (planet_exclude_re.search(title) \ |
| or planet_exclude_re.search(content)): |
| continue |
| |
| if channel_filter_re: |
| if not (channel_filter_re.search(title) \ |
| or channel_filter_re.search(content)): |
| continue |
| |
| if channel_exclude_re: |
| if (channel_exclude_re.search(title) \ |
| or channel_exclude_re.search(content)): |
| continue |
| |
| if not seen_guids.has_key(item.id): |
| seen_guids[item.id] = 1
| items.append((time.mktime(item.date), item.order, item)) |
| |
| # Sort the list |
| if sorted: |
| items.sort() |
| items.reverse() |
| |
| # Apply max_items filter |
| if len(items) and max_items: |
| items = items[:max_items] |
| |
| # Apply max_days filter |
| if len(items) and max_days: |
| max_count = 0 |
| max_time = items[0][0] - max_days * 86400  # 86400 seconds per day
| for item in items: |
| if item[0] > max_time: |
| max_count += 1 |
| else: |
| items = items[:max_count] |
| break |
| |
| return [ i[-1] for i in items ] |
| |
| class Channel(cache.CachedInfo): |
| """A list of news items. |
| |
| This class represents a list of news items taken from the feed of |
| a website or other source. |
| |
| Properties: |
| url URL of the feed. |
| url_etag E-Tag of the feed URL. |
| url_modified Last modified time of the feed URL. |
| url_status Last HTTP status of the feed URL. |
| hidden Channel should be hidden (True if exists). |
| name Name of the feed owner, or feed title. |
| next_order Next order number to be assigned to NewsItem |
| |
| updated Correct UTC-Normalised update time of the feed. |
| last_updated Correct UTC-Normalised time the feed was last updated. |
| |
| id An identifier the feed claims is unique (*). |
| title One-line title (*). |
| link Link to the original format feed (*). |
| tagline Short description of the feed (*). |
| info Longer description of the feed (*). |
| |
| modified Date the feed claims to have been modified (*). |
| |
| author Name of the author (*). |
| publisher Name of the publisher (*). |
| generator Name of the feed generator (*). |
| category Category name (*). |
| copyright Copyright information for humans to read (*). |
| license Link to the licence for the content (*). |
| docs Link to the specification of the feed format (*). |
| language Primary language (*). |
| errorreportsto E-Mail address to send error reports to (*). |
| |
| image_url URL of an associated image (*). |
| image_link Link to go with the associated image (*). |
| image_title Alternative text of the associated image (*). |
| image_width Width of the associated image (*). |
| image_height Height of the associated image (*). |
| |
| filter A regular expression that articles must match. |
| exclude A regular expression that articles must not match. |
| |
| Properties marked (*) will only be present if the original feed |
| contained them. Note that the optional 'modified' date field is simply |
| a claim made by the item and parsed from the information given;
| 'updated' (and 'last_updated') are far more reliable sources of
| information.
| |
| Some feeds may define additional properties to those above. |
| """ |
| IGNORE_KEYS = ("links", "contributors", "textinput", "cloud", "categories", |
| "url", "href", "url_etag", "url_modified", "tags", "itunes_explicit") |
| |
| def __init__(self, planet, url): |
| if not os.path.isdir(planet.cache_directory): |
| os.makedirs(planet.cache_directory) |
| cache_filename = cache.filename(planet.cache_directory, url) |
| cache_file = dbhash.open(cache_filename, "c", 0666) |
| |
| cache.CachedInfo.__init__(self, cache_file, url, root=1) |
| |
| self._items = {} |
| self._planet = planet |
| self._expired = [] |
| self.url = url |
| # retain the original URL for error reporting |
| self.configured_url = url |
| self.url_etag = None |
| self.url_status = None |
| self.url_modified = None |
| self.name = None |
| self.updated = None |
| self.last_updated = None |
| self.filter = None |
| self.exclude = None |
| self.next_order = "0" |
| self.cache_read() |
| self.cache_read_entries() |
| |
| if planet.config.has_section(url): |
| for option in planet.config.options(url): |
| value = planet.config.get(url, option) |
| self.set_as_string(option, value, cached=0) |
| |
| def has_item(self, id_): |
| """Check whether the item exists in the channel.""" |
| return self._items.has_key(id_) |
| |
| def get_item(self, id_): |
| """Return the item from the channel.""" |
| return self._items[id_] |
| |
| # Special methods |
| __contains__ = has_item |
| |
| def items(self, hidden=0, sorted=0): |
| """Return the item list.""" |
| items = [] |
| for item in self._items.values(): |
| if hidden or not item.has_key("hidden"): |
| items.append((time.mktime(item.date), item.order, item)) |
| |
| if sorted: |
| items.sort() |
| items.reverse() |
| |
| return [ i[-1] for i in items ] |
| |
| def __iter__(self): |
| """Iterate the sorted item list.""" |
| return iter(self.items(sorted=1)) |
| |
| def cache_read_entries(self): |
| """Read entry information from the cache.""" |
| keys = self._cache.keys() |
| for key in keys: |
| if key.find(" ") != -1: continue |
| if self.has_key(key): continue |
| |
| item = NewsItem(self, key) |
| self._items[key] = item |
| |
| def cache_basename(self): |
| return cache.filename('',self._id) |
| |
| def cache_write(self, sync=1): |
| """Write channel and item information to the cache.""" |
| for item in self._items.values(): |
| item.cache_write(sync=0) |
| for item in self._expired: |
| item.cache_clear(sync=0) |
| cache.CachedInfo.cache_write(self, sync) |
| |
| self._expired = [] |
| |
| def feed_information(self): |
| """ |
| Returns a description string for the feed embedded in this channel. |
| |
| This will usually simply be the feed url embedded in <>, but in the |
| case where the current self.url has changed from the original |
| self.configured_url the string will contain both pieces of information. |
| This is so that the URL in question is easier to find in logging |
| output: getting an error about a URL that doesn't appear in your config |
| file is annoying. |
| """ |
| if self.url == self.configured_url: |
| return "<%s>" % self.url |
| else: |
| return "<%s> (formerly <%s>)" % (self.url, self.configured_url) |
| |
| def update(self): |
| """Download the feed to refresh the information. |
| |
| This does the actual work of pulling down the feed and if it changes |
| updates the cached information about the feed and entries within it. |
| """ |
| info = feedparser.parse(self.url, |
| etag=self.url_etag, modified=self.url_modified, |
| agent=self._planet.user_agent) |
| if info.has_key("status"): |
| self.url_status = str(info.status) |
| elif info.has_key("entries") and len(info.entries)>0: |
| self.url_status = str(200) |
| elif info.bozo and info.bozo_exception.__class__.__name__=='Timeout': |
| self.url_status = str(408) |
| else: |
| self.url_status = str(500) |
| |
| if self.url_status == '301' and \
|    (info.has_key("entries") and len(info.entries)>0):
|     log.warning("Feed has moved from <%s> to <%s>", self.url, info.url)
|     try:
|         os.link(cache.filename(self._planet.cache_directory, self.url),
|                 cache.filename(self._planet.cache_directory, info.url))
|     except (AttributeError, OSError):
|         # hard links may be unsupported, or the new name may already exist
|         pass
|     self.url = info.url
| elif self.url_status == '304': |
| log.info("Feed %s unchanged", self.feed_information()) |
| return |
| elif self.url_status == '410': |
| log.info("Feed %s gone", self.feed_information()) |
| self.cache_write() |
| return |
| elif self.url_status == '408': |
| log.warning("Feed %s timed out", self.feed_information()) |
| return |
| elif int(self.url_status) >= 400: |
| log.error("Error %s while updating feed %s", |
| self.url_status, self.feed_information()) |
| return |
| else: |
| log.info("Updating feed %s", self.feed_information()) |
| |
| self.url_etag = info.has_key("etag") and info.etag or None |
| self.url_modified = info.has_key("modified") and info.modified or None |
| if self.url_etag is not None: |
| log.debug("E-Tag: %s", self.url_etag) |
| if self.url_modified is not None: |
| log.debug("Last Modified: %s", |
| time.strftime(TIMEFMT_ISO, self.url_modified)) |
| |
| self.update_info(info.feed) |
| self.update_entries(info.entries) |
| self.cache_write() |
| |
| def update_info(self, feed): |
| """Update information from the feed. |
| |
| This reads the feed information supplied by feedparser and updates |
| the cached information about the feed. These are the various |
| potentially interesting properties that you might care about. |
| """ |
| for key in feed.keys(): |
| if key in self.IGNORE_KEYS or key + "_parsed" in self.IGNORE_KEYS: |
| # Ignored fields |
| pass |
| elif feed.has_key(key + "_parsed"): |
| # Ignore unparsed date fields |
| pass |
| elif key.endswith("_detail"): |
| # retain name and email sub-fields |
| if feed[key].has_key('name') and feed[key].name: |
| self.set_as_string(key.replace("_detail","_name"), \ |
| feed[key].name) |
| if feed[key].has_key('email') and feed[key].email: |
| self.set_as_string(key.replace("_detail","_email"), \ |
| feed[key].email) |
| elif key == "items": |
| # Ignore items field |
| pass |
| elif key.endswith("_parsed"): |
| # Date fields |
| if feed[key] is not None: |
| self.set_as_date(key[:-len("_parsed")], feed[key]) |
| elif key == "image": |
| # Image field: save all the information |
| if feed[key].has_key("url"): |
| self.set_as_string(key + "_url", feed[key].url) |
| if feed[key].has_key("link"): |
| self.set_as_string(key + "_link", feed[key].link) |
| if feed[key].has_key("title"): |
| self.set_as_string(key + "_title", feed[key].title) |
| if feed[key].has_key("width"): |
| self.set_as_string(key + "_width", str(feed[key].width)) |
| if feed[key].has_key("height"): |
| self.set_as_string(key + "_height", str(feed[key].height)) |
| elif isinstance(feed[key], (str, unicode)): |
| # String fields |
| try: |
| detail = key + '_detail' |
| if feed.has_key(detail) and feed[detail].has_key('type'): |
| if feed[detail].type == 'text/html': |
| feed[key] = sanitize.HTML(feed[key]) |
| elif feed[detail].type == 'text/plain': |
| feed[key] = escape(feed[key]) |
| self.set_as_string(key, feed[key]) |
| except KeyboardInterrupt: |
| raise |
| except: |
| log.exception("Ignored '%s' of <%s>, unknown format", |
| key, self.url) |
| |
| def update_entries(self, entries): |
| """Update entries from the feed. |
| |
| This reads the entries supplied by feedparser and updates the |
| cached information about them. It's at this point we update |
| the 'updated' timestamp and keep the old one in 'last_updated', |
| these provide boundaries for acceptable entry times. |
| |
| If this is the first time a feed has been updated then most of the |
| items will be marked as hidden, according to Planet.new_feed_items. |
| |
| If the feed does not contain items which, according to the sort order, |
| should be there, those items are assumed to have been expired from
| the feed or replaced, and are removed from the cache.
| """ |
| if not len(entries): |
| return |
| |
| self.last_updated = self.updated |
| self.updated = time.gmtime() |
| |
| new_items = [] |
| feed_items = [] |
| for entry in entries: |
| # Try really hard to find some kind of unique identifier |
| if entry.has_key("id"): |
| entry_id = cache.utf8(entry.id) |
| elif entry.has_key("link"): |
| entry_id = cache.utf8(entry.link) |
| elif entry.has_key("title"): |
| entry_id = (self.url + "/" |
| + md5.new(cache.utf8(entry.title)).hexdigest()) |
| elif entry.has_key("summary"): |
| entry_id = (self.url + "/" |
| + md5.new(cache.utf8(entry.summary)).hexdigest()) |
| else: |
| log.error("Unable to find or generate id, entry ignored") |
| continue |
| |
| # Create the item if necessary and update |
| if self.has_item(entry_id): |
| item = self._items[entry_id] |
| else: |
| item = NewsItem(self, entry_id) |
| self._items[entry_id] = item |
| new_items.append(item) |
| item.update(entry) |
| feed_items.append(entry_id) |
| |
| # Hide excess items the first time through |
| if self.last_updated is None and self._planet.new_feed_items \ |
| and len(feed_items) > self._planet.new_feed_items: |
| item.hidden = "yes" |
| log.debug("Marked <%s> as hidden (new feed)", entry_id) |
| |
| # Assign order numbers in reverse |
| new_items.reverse() |
| for item in new_items: |
| item.order = self.next_order = str(int(self.next_order) + 1) |
| |
| # Check for expired or replaced items |
| feed_count = len(feed_items) |
| log.debug("Items in Feed: %d", feed_count) |
| for item in self.items(sorted=1): |
| if feed_count < 1: |
| break |
| elif item.id in feed_items: |
| feed_count -= 1 |
| elif item._channel.url_status != '226': |
| del(self._items[item.id]) |
| self._expired.append(item) |
| log.debug("Removed expired or replaced item <%s>", item.id) |
| |
| def get_name(self, key):
|     """Return the channel name, from whichever key holds it.
| 
|     The passed key is ignored; "name" and "title" are probed in
|     preference order.
|     """
| for key in ("name", "title"): |
| if self.has_key(key) and self.key_type(key) != self.NULL: |
| return self.get_as_string(key) |
| |
| return "" |
| |
| class NewsItem(cache.CachedInfo): |
| """An item of news. |
| |
| This class represents a single item of news on a channel. They're |
| created by members of the Channel class and accessible through it. |
| |
| Properties: |
| id Channel-unique identifier for this item. |
| id_hash Relatively short, printable cryptographic hash of id |
| date Corrected UTC-Normalised update time, for sorting. |
| order Order in which items on the same date can be sorted. |
| hidden Item should be hidden (True if exists). |
| |
| title One-line title (*). |
| link Link to the original format text (*). |
| summary Short first-page summary (*). |
| content Full HTML content. |
| |
| modified Date the item claims to have been modified (*). |
| issued Date the item claims to have been issued (*). |
| created Date the item claims to have been created (*). |
| expired Date the item claims to expire (*). |
| |
| author Name of the author (*). |
| publisher Name of the publisher (*). |
| category Category name (*). |
| comments Link to a page to enter comments (*). |
| license Link to the licence for the content (*). |
| source_name Name of the original source of this item (*). |
| source_link Link to the original source of this item (*). |
| |
| Properties marked (*) will only be present if the original feed |
| contained them. Note that the various optional date fields are |
| simply claims made by the item and parsed from the information |
| given, 'date' is a far more reliable source of information. |
| |
| Some feeds may define additional properties to those above. |
| """ |
| IGNORE_KEYS = ("categories", "contributors", "enclosures", "links", |
| "guidislink", "date", "tags") |
| |
| def __init__(self, channel, id_): |
| cache.CachedInfo.__init__(self, channel._cache, id_) |
| |
| self._channel = channel |
| self.id = id_ |
| self.id_hash = md5.new(id_).hexdigest() |
| self.date = None |
| self.order = None |
| self.content = None |
| self.cache_read() |
| |
| def update(self, entry): |
| """Update the item from the feedparser entry given.""" |
| for key in entry.keys(): |
| if key in self.IGNORE_KEYS or key + "_parsed" in self.IGNORE_KEYS: |
| # Ignored fields |
| pass |
| elif entry.has_key(key + "_parsed"): |
| # Ignore unparsed date fields |
| pass |
| elif key.endswith("_detail"): |
| # retain name, email, and language sub-fields |
| if entry[key].has_key('name') and entry[key].name: |
| self.set_as_string(key.replace("_detail","_name"), \ |
| entry[key].name) |
| if entry[key].has_key('email') and entry[key].email: |
| self.set_as_string(key.replace("_detail","_email"), \ |
| entry[key].email) |
| if entry[key].has_key('language') and entry[key].language and \ |
| (not self._channel.has_key('language') or \ |
| entry[key].language != self._channel.language): |
| self.set_as_string(key.replace("_detail","_language"), \ |
| entry[key].language) |
| elif key.endswith("_parsed"): |
| # Date fields |
| if entry[key] is not None: |
| self.set_as_date(key[:-len("_parsed")], entry[key]) |
| elif key == "source": |
| # Source field: save both url and value |
| if entry[key].has_key("value"): |
| self.set_as_string(key + "_name", entry[key].value) |
| if entry[key].has_key("url"): |
| self.set_as_string(key + "_link", entry[key].url) |
| elif key == "content": |
| # Content field: concatenate the values |
| value = "" |
| for item in entry[key]: |
| if item.type == 'text/html': |
| item.value = sanitize.HTML(item.value) |
| elif item.type == 'text/plain': |
| item.value = escape(item.value) |
| if item.has_key('language') and item.language and \ |
| (not self._channel.has_key('language') or |
| item.language != self._channel.language) : |
| self.set_as_string(key + "_language", item.language) |
| value += cache.utf8(item.value) |
| self.set_as_string(key, value) |
| elif isinstance(entry[key], (str, unicode)): |
| # String fields |
| try: |
| detail = key + '_detail' |
| if entry.has_key(detail): |
| if entry[detail].has_key('type'): |
| if entry[detail].type == 'text/html': |
| entry[key] = sanitize.HTML(entry[key]) |
| elif entry[detail].type == 'text/plain': |
| entry[key] = escape(entry[key]) |
| self.set_as_string(key, entry[key]) |
| except KeyboardInterrupt: |
| raise |
| except: |
| log.exception("Ignored '%s' of <%s>, unknown format", |
| key, self.id) |
| |
| # Generate the date field if we need to |
| self.get_date("date") |
| |
| def get_date(self, key): |
| """Get (or update) the date key. |
| |
| We check whether the date the entry claims to have been changed falls
| between the time we last updated this feed and the time we pulled the
| feed off the site.
| |
| If it is then it's probably not bogus, and we'll sort accordingly. |
| |
| If it isn't then we bound it appropriately; this ensures that
| entries appear in posting sequence but don't overlap entries |
| added in previous updates and don't creep into the next one. |
| """ |
| |
| for other_key in ("updated", "modified", "published", "issued", "created"): |
| if self.has_key(other_key): |
| date = self.get_as_date(other_key) |
| break |
| else: |
| date = None |
| |
| if date is not None: |
| if date > self._channel.updated: |
| date = self._channel.updated |
| # elif date < self._channel.last_updated: |
| # date = self._channel.updated |
| elif self.has_key(key) and self.key_type(key) != self.NULL: |
| return self.get_as_date(key) |
| else: |
| date = self._channel.updated |
| |
| self.set_as_date(key, date) |
| return date |
| |
| def get_content(self, key):
|     """Return the item content, from whichever key holds it.
| 
|     The passed key is ignored; "content", "tagline" and "summary" are
|     probed in preference order.
|     """
| for key in ("content", "tagline", "summary"): |
| if self.has_key(key) and self.key_type(key) != self.NULL: |
| return self.get_as_string(key) |
| |
| return "" |