| #!/usr/bin/env python |
| # -*- coding: UTF-8 -*- |
| """Planet aggregator library. |
| |
| This package is a library for developing web sites or software that |
| aggregate RSS, CDF and Atom feeds taken from elsewhere into a single, |
| combined feed. |
| """ |
| |
| __version__ = "2.0" |
| __authors__ = [ "Scott James Remnant <scott@netsplit.com>", |
| "Jeff Waugh <jdub@perkypants.org>" ] |
| __license__ = "Python" |
| |
| |
| # Modules available without separate import |
| import cache |
| import feedparser |
| import sanitize |
| import htmltmpl |
| import sgmllib |
| try: |
| import logging |
| except ImportError:
| import compat_logging as logging |
| |
| # Limit the effect of "from planet import *" |
| __all__ = ("cache", "feedparser", "htmltmpl", "logging", |
| "Planet", "Channel", "NewsItem") |
| |
| |
| import os |
| import md5 |
| import time |
| import dbhash |
| import re |
| |
| try:
|     from xml.sax.saxutils import escape
| except ImportError:
|     # Minimal fallback covering the three characters unsafe in XML
|     # character data, e.g. escape("<a & b>") -> "&lt;a &amp; b&gt;".
|     # The ampersand must be replaced first.
|     def escape(data):
|         return data.replace("&", "&amp;").replace(">", "&gt;").replace("<", "&lt;")
| |
| # Version information (for generator headers) |
| VERSION = ("Planet/%s +http://www.planetplanet.org" % __version__) |
| |
| # Default User-Agent header to send when retrieving feeds
| USER_AGENT = VERSION + " " + feedparser.USER_AGENT |
| |
| # Default cache directory |
| CACHE_DIRECTORY = "cache" |
| |
| # Default number of items to display from a new feed |
| NEW_FEED_ITEMS = 10 |
| |
| # Useful common date/time formats |
| TIMEFMT_ISO = "%Y-%m-%dT%H:%M:%S+00:00" |
| TIMEFMT_822 = "%a, %d %b %Y %H:%M:%S +0000" |
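| # For example, time.strftime(TIMEFMT_ISO, time.gmtime(0)) yields
| # "1970-01-01T00:00:00+00:00", and TIMEFMT_822 yields
| # "Thu, 01 Jan 1970 00:00:00 +0000".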
| |
| |
| # Log instance to use here |
| log = logging.getLogger("planet") |
| try: |
| log.warning |
| except AttributeError:
|     # older logging modules only provide warn()
|     log.warning = log.warn
| |
| # Defaults for the template file config sections |
| ENCODING = "utf-8" |
| ITEMS_PER_PAGE = 60 |
| DAYS_PER_PAGE = 0 |
| OUTPUT_DIR = "output" |
| DATE_FORMAT = "%B %d, %Y %I:%M %p" |
| NEW_DATE_FORMAT = "%B %d, %Y" |
| ACTIVITY_THRESHOLD = 0 |
| |
| class stripHtml(sgmllib.SGMLParser):
|     """Remove all tags from the data."""
|     def __init__(self, data):
|         sgmllib.SGMLParser.__init__(self)
|         self.result = ''
|         self.feed(data)
|         self.close()
|     def handle_data(self, data):
|         if data:
|             self.result += data
| |
| def template_info(item, date_format): |
| """Produce a dictionary of template information.""" |
| info = {} |
| for key in item.keys(): |
| if item.key_type(key) == item.DATE: |
| date = item.get_as_date(key) |
| info[key] = time.strftime(date_format, date) |
| info[key + "_iso"] = time.strftime(TIMEFMT_ISO, date) |
| info[key + "_822"] = time.strftime(TIMEFMT_822, date) |
| else: |
| info[key] = item[key] |
| if 'title' in item.keys(): |
| info['title_plain'] = stripHtml(info['title']).result |
| |
| return info |
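| # For illustration (hypothetical item): an item with an HTML 'title'
| # and a 'date' key might produce
| #
| #     {"title": "Some <b>post</b>",
| #      "title_plain": "Some post",
| #      "date": "January 01, 2006 12:00 PM",
| #      "date_iso": "2006-01-01T12:00:00+00:00",
| #      "date_822": "Sun, 01 Jan 2006 12:00:00 +0000"}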
| |
| |
| class Planet: |
| """A set of channels. |
| |
| This class represents a set of channels for which the items will |
| be aggregated together into one combined feed. |
| |
| Properties: |
| user_agent User-Agent header to fetch feeds with. |
| cache_directory Directory to store cached channels in. |
| new_feed_items Number of items to display from a new feed. |
| filter A regular expression that articles must match. |
| exclude A regular expression that articles must not match. |
| """ |
| def __init__(self, config): |
| self.config = config |
| |
| self._channels = [] |
| |
| self.user_agent = USER_AGENT |
| self.cache_directory = CACHE_DIRECTORY |
| self.new_feed_items = NEW_FEED_ITEMS |
| self.filter = None |
| self.exclude = None |
| |
| def tmpl_config_get(self, template, option, default=None, raw=0, vars=None): |
| """Get a template value from the configuration, with a default.""" |
| if self.config.has_option(template, option):
|     return self.config.get(template, option, raw=raw, vars=vars)
| elif self.config.has_option("Planet", option):
|     return self.config.get("Planet", option, raw=raw, vars=vars)
| else:
|     return default
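| # For example (hypothetical config): given
| #
| #     [Planet]
| #     items_per_page = 30
| #
| #     [index.html.tmpl]
| #     items_per_page = 10
| #
| # tmpl_config_get("index.html.tmpl", "items_per_page", ITEMS_PER_PAGE)
| # returns "10", while templates without their own setting fall back to
| # the [Planet] value "30", and to the default only if neither is set.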
| |
| def gather_channel_info(self, template_file="Planet"): |
| date_format = self.tmpl_config_get(template_file, |
| "date_format", DATE_FORMAT, raw=1) |
| |
| activity_threshold = int(self.tmpl_config_get(template_file, |
| "activity_threshold", |
| ACTIVITY_THRESHOLD)) |
| |
| if activity_threshold: |
| activity_horizon = \ |
| time.gmtime(time.time()-86400*activity_threshold) |
| else: |
| activity_horizon = 0 |
| |
| channels = {} |
| channels_list = [] |
| for channel in self.channels(hidden=1): |
| channels[channel] = template_info(channel, date_format) |
| channels_list.append(channels[channel]) |
| |
| # identify inactive feeds |
| if activity_horizon: |
| latest = channel.items(sorted=1) |
| if len(latest)==0 or latest[0].date < activity_horizon: |
| channels[channel]["message"] = \ |
| "no activity in %d days" % activity_threshold |
| |
| # report channel level errors |
| if not channel.url_status: continue |
| status = int(channel.url_status) |
| if status == 403: |
| channels[channel]["message"] = "403: forbidden" |
| elif status == 404: |
| channels[channel]["message"] = "404: not found" |
| elif status == 408: |
| channels[channel]["message"] = "408: request timeout" |
| elif status == 410: |
| channels[channel]["message"] = "410: gone" |
| elif status == 500: |
| channels[channel]["message"] = "internal server error" |
| elif status >= 400: |
| channels[channel]["message"] = "http status %s" % status |
| |
| return channels, channels_list |
| |
| def gather_items_info(self, channels, template_file="Planet", channel_list=None): |
| items_list = [] |
| prev_date = [] |
| prev_channel = None |
| |
| date_format = self.tmpl_config_get(template_file, |
| "date_format", DATE_FORMAT, raw=1) |
| items_per_page = int(self.tmpl_config_get(template_file, |
| "items_per_page", ITEMS_PER_PAGE)) |
| days_per_page = int(self.tmpl_config_get(template_file, |
| "days_per_page", DAYS_PER_PAGE)) |
| new_date_format = self.tmpl_config_get(template_file, |
| "new_date_format", NEW_DATE_FORMAT, raw=1) |
| |
| for newsitem in self.items(max_items=items_per_page, |
| max_days=days_per_page, |
| channels=channel_list): |
| item_info = template_info(newsitem, date_format) |
| chan_info = channels[newsitem._channel] |
| for k, v in chan_info.items(): |
| item_info["channel_" + k] = v |
| |
| # Check for the start of a new day |
| if prev_date[:3] != newsitem.date[:3]: |
| prev_date = newsitem.date |
| item_info["new_date"] = time.strftime(new_date_format, |
| newsitem.date) |
| |
| # Check for the start of a new channel |
| if item_info.has_key("new_date") \ |
| or prev_channel != newsitem._channel: |
| prev_channel = newsitem._channel |
| item_info["new_channel"] = newsitem._channel.url |
| |
| items_list.append(item_info) |
| |
| return items_list |
| |
| def run(self, planet_name, planet_link, template_files, offline = False): |
| log = logging.getLogger("planet.runner") |
| |
| # Create a planet |
| log.info("Loading cached data") |
| if self.config.has_option("Planet", "cache_directory"): |
| self.cache_directory = self.config.get("Planet", "cache_directory") |
| if self.config.has_option("Planet", "new_feed_items"): |
| self.new_feed_items = int(self.config.get("Planet", "new_feed_items")) |
| self.user_agent = "%s +%s %s" % (planet_name, planet_link, |
| self.user_agent) |
| if self.config.has_option("Planet", "filter"): |
| self.filter = self.config.get("Planet", "filter") |
| |
| # The other configuration blocks are channels to subscribe to |
| for feed_url in self.config.sections(): |
| if feed_url == "Planet" or feed_url in template_files: |
| continue |
| |
| # Create a channel, configure it and subscribe it |
| channel = Channel(self, feed_url) |
| self.subscribe(channel) |
| |
| # Update it |
| try: |
| if not offline and channel.url_status != '410':
| channel.update() |
| except KeyboardInterrupt: |
| raise |
| except: |
| log.exception("Update of <%s> failed", feed_url) |
| |
| def generate_all_files(self, template_files, planet_name, |
| planet_link, planet_feed, owner_name, owner_email): |
| |
| log = logging.getLogger("planet.runner") |
| # Go-go-gadget-template |
| for template_file in template_files: |
| manager = htmltmpl.TemplateManager() |
| log.info("Processing template %s", template_file) |
| try: |
| template = manager.prepare(template_file) |
| except htmltmpl.TemplateError: |
| template = manager.prepare(os.path.basename(template_file)) |
| # Read the configuration |
| output_dir = self.tmpl_config_get(template_file, |
| "output_dir", OUTPUT_DIR) |
| date_format = self.tmpl_config_get(template_file, |
| "date_format", DATE_FORMAT, raw=1) |
| encoding = self.tmpl_config_get(template_file, "encoding", ENCODING) |
| |
| # We treat each template individually |
| base = os.path.splitext(os.path.basename(template_file))[0] |
| url = os.path.join(planet_link, base) |
| output_file = os.path.join(output_dir, base) |
| |
| # Gather information |
| channels, channels_list = self.gather_channel_info(template_file) |
| items_list = self.gather_items_info(channels, template_file) |
| |
| # Process the template |
| tp = htmltmpl.TemplateProcessor(html_escape=0) |
| tp.set("Items", items_list) |
| tp.set("Channels", channels_list) |
| |
| # Generic information |
| tp.set("generator", VERSION) |
| tp.set("name", planet_name) |
| tp.set("link", planet_link) |
| tp.set("owner_name", owner_name) |
| tp.set("owner_email", owner_email) |
| tp.set("url", url) |
| |
| if planet_feed: |
| tp.set("feed", planet_feed) |
| tp.set("feedtype", planet_feed.find('rss')>=0 and 'rss' or 'atom') |
| |
| # Update time |
| date = time.gmtime() |
| tp.set("date", time.strftime(date_format, date)) |
| tp.set("date_iso", time.strftime(TIMEFMT_ISO, date)) |
| tp.set("date_822", time.strftime(TIMEFMT_822, date)) |
| |
| try: |
| log.info("Writing %s", output_file) |
| output_fd = open(output_file, "w") |
| if encoding.lower() in ("utf-8", "utf8"): |
| # UTF-8 output is the default because we use that internally |
| output_fd.write(tp.process(template)) |
| elif encoding.lower() in ("xml", "html", "sgml"): |
| # Magic for Python 2.3 users |
| output = tp.process(template).decode("utf-8") |
| output_fd.write(output.encode("ascii", "xmlcharrefreplace")) |
| else: |
| # Must be a "known" encoding |
| output = tp.process(template).decode("utf-8") |
| output_fd.write(output.encode(encoding, "replace")) |
| output_fd.close() |
| except KeyboardInterrupt: |
| raise |
| except: |
| log.exception("Write of %s failed", output_file) |
| |
| def channels(self, hidden=0, sorted=1): |
| """Return the list of channels.""" |
| channels = [] |
| for channel in self._channels: |
| if hidden or not channel.has_key("hidden"): |
| channels.append((channel.name, channel)) |
| |
| if sorted: |
| channels.sort() |
| |
| return [ c[-1] for c in channels ] |
| |
| def find_by_basename(self, basename): |
| for channel in self._channels: |
| if basename == channel.cache_basename(): return channel |
| |
| def subscribe(self, channel): |
| """Subscribe the planet to the channel.""" |
| self._channels.append(channel) |
| |
| def unsubscribe(self, channel): |
| """Unsubscribe the planet from the channel.""" |
| self._channels.remove(channel) |
| |
| def items(self, hidden=0, sorted=1, max_items=0, max_days=0, channels=None): |
| """Return an optionally filtered list of items in the channel. |
| |
| The filters are applied in the following order: |
| |
| If hidden is true then items in hidden channels and hidden items
| will also be returned.
| |
| If sorted is true then the item list will be sorted with the newest |
| first. |
| |
| If max_items is non-zero then this number of items, at most, will |
| be returned. |
| |
| If max_days is non-zero then any items older than the newest by
| this number of days won't be returned.  Requires sorted=1 to work.
| 
| The sharp-eyed will note that the code below looks a little strange:
| it turns out that Python gets *really* slow if we try to sort the
| actual items themselves, so we sort (date, order, item) tuples
| instead.  We also use mktime here, which is fine because we discard
| the numbers and only need them to be consistent relative to each
| other.
| """ |
| planet_filter_re = None |
| if self.filter: |
| planet_filter_re = re.compile(self.filter, re.I) |
| planet_exclude_re = None |
| if self.exclude: |
| planet_exclude_re = re.compile(self.exclude, re.I) |
| |
| items = [] |
| seen_guids = {} |
| if not channels: channels=self.channels(hidden=hidden, sorted=0) |
| for channel in channels: |
| for item in channel._items.values(): |
| if hidden or not item.has_key("hidden"): |
| |
| channel_filter_re = None |
| if channel.filter: |
| channel_filter_re = re.compile(channel.filter, |
| re.I) |
| channel_exclude_re = None |
| if channel.exclude: |
| channel_exclude_re = re.compile(channel.exclude, |
| re.I) |
| if (planet_filter_re or planet_exclude_re \ |
| or channel_filter_re or channel_exclude_re): |
| title = "" |
| if item.has_key("title"): |
| title = item.title |
| content = item.get_content("content") |
| |
| if planet_filter_re: |
| if not (planet_filter_re.search(title) \ |
| or planet_filter_re.search(content)): |
| continue |
| |
| if planet_exclude_re: |
| if (planet_exclude_re.search(title) \ |
| or planet_exclude_re.search(content)): |
| continue |
| |
| if channel_filter_re: |
| if not (channel_filter_re.search(title) \ |
| or channel_filter_re.search(content)): |
| continue |
| |
| if channel_exclude_re: |
| if (channel_exclude_re.search(title) \ |
| or channel_exclude_re.search(content)): |
| continue |
| |
| if not seen_guids.has_key(item.id): |
| seen_guids[item.id] = 1
| items.append((time.mktime(item.date), item.order, item)) |
| |
| # Sort the list |
| if sorted: |
| items.sort() |
| items.reverse() |
| |
| # Apply max_items filter |
| if len(items) and max_items: |
| items = items[:max_items] |
| |
| # Apply max_days filter |
| if len(items) and max_days: |
| max_count = 0 |
| max_time = items[0][0] - max_days * 86400  # 86400 seconds per day
| for item in items: |
| if item[0] > max_time: |
| max_count += 1 |
| else: |
| items = items[:max_count] |
| break |
| |
| return [ i[-1] for i in items ] |
| |
| class Channel(cache.CachedInfo): |
| """A list of news items. |
| |
| This class represents a list of news items taken from the feed of |
| a website or other source. |
| |
| Properties: |
| url URL of the feed. |
| url_etag E-Tag of the feed URL. |
| url_modified Last modified time of the feed URL. |
| url_status Last HTTP status of the feed URL. |
| hidden Channel should be hidden (True if exists). |
| name Name of the feed owner, or feed title. |
| next_order Next order number to be assigned to NewsItem |
| |
| updated Correct UTC-Normalised update time of the feed. |
| last_updated Correct UTC-Normalised time the feed was last updated. |
| |
| id An identifier the feed claims is unique (*). |
| title One-line title (*). |
| link Link to the original format feed (*). |
| tagline Short description of the feed (*). |
| info Longer description of the feed (*). |
| |
| modified Date the feed claims to have been modified (*). |
| |
| author Name of the author (*). |
| publisher Name of the publisher (*). |
| generator Name of the feed generator (*). |
| category Category name (*). |
| copyright Copyright information for humans to read (*). |
| license Link to the licence for the content (*). |
| docs Link to the specification of the feed format (*). |
| language Primary language (*). |
| errorreportsto E-Mail address to send error reports to (*). |
| |
| image_url URL of an associated image (*). |
| image_link Link to go with the associated image (*). |
| image_title Alternative text of the associated image (*). |
| image_width Width of the associated image (*). |
| image_height Height of the associated image (*). |
| |
| filter A regular expression that articles must match. |
| exclude A regular expression that articles must not match. |
| |
| Properties marked (*) will only be present if the original feed |
| contained them. Note that the optional 'modified' date field is simply |
| a claim made by the item and parsed from the information given;
| 'updated' (and 'last_updated') are far more reliable sources of
| information.
| |
| Some feeds may define additional properties to those above. |
| """ |
| IGNORE_KEYS = ("links", "contributors", "textinput", "cloud", "categories", |
| "url", "href", "url_etag", "url_modified", "tags", "itunes_explicit") |
| |
| def __init__(self, planet, url): |
| if not os.path.isdir(planet.cache_directory): |
| os.makedirs(planet.cache_directory) |
| cache_filename = cache.filename(planet.cache_directory, url) |
| cache_file = dbhash.open(cache_filename, "c", 0666) |
| |
| cache.CachedInfo.__init__(self, cache_file, url, root=1) |
| |
| self._items = {} |
| self._planet = planet |
| self._expired = [] |
| self.url = url |
| # retain the original URL for error reporting |
| self.configured_url = url |
| self.url_etag = None |
| self.url_status = None |
| self.url_modified = None |
| self.name = None |
| self.updated = None |
| self.last_updated = None |
| self.filter = None |
| self.exclude = None |
| self.next_order = "0" |
| self.cache_read() |
| self.cache_read_entries() |
| |
| if planet.config.has_section(url): |
| for option in planet.config.options(url): |
| value = planet.config.get(url, option) |
| self.set_as_string(option, value, cached=0) |
| |
| def has_item(self, id_): |
| """Check whether the item exists in the channel.""" |
| return self._items.has_key(id_) |
| |
| def get_item(self, id_): |
| """Return the item from the channel.""" |
| return self._items[id_] |
| |
| # Special methods |
| __contains__ = has_item |
| |
| def items(self, hidden=0, sorted=0): |
| """Return the item list.""" |
| items = [] |
| for item in self._items.values(): |
| if hidden or not item.has_key("hidden"): |
| items.append((time.mktime(item.date), item.order, item)) |
| |
| if sorted: |
| items.sort() |
| items.reverse() |
| |
| return [ i[-1] for i in items ] |
| |
| def __iter__(self): |
| """Iterate the sorted item list.""" |
| return iter(self.items(sorted=1)) |
| |
| def cache_read_entries(self): |
| """Read entry information from the cache.""" |
| keys = self._cache.keys() |
| for key in keys: |
| if key.find(" ") != -1: continue |
| if self.has_key(key): continue |
| |
| item = NewsItem(self, key) |
| self._items[key] = item |
| |
| def cache_basename(self): |
| return cache.filename('',self._id) |
| |
| def cache_write(self, sync=1): |
| """Write channel and item information to the cache.""" |
| for item in self._items.values(): |
| item.cache_write(sync=0) |
| for item in self._expired: |
| item.cache_clear(sync=0) |
| cache.CachedInfo.cache_write(self, sync) |
| |
| self._expired = [] |
| |
| def feed_information(self): |
| """ |
| Returns a description string for the feed embedded in this channel. |
| |
| This will usually simply be the feed url embedded in <>, but in the |
| case where the current self.url has changed from the original |
| self.configured_url the string will contain both pieces of information. |
| This is so that the URL in question is easier to find in logging |
| output: getting an error about a URL that doesn't appear in your config |
| file is annoying. |
| """ |
| if self.url == self.configured_url: |
| return "<%s>" % self.url |
| else: |
| return "<%s> (formerly <%s>)" % (self.url, self.configured_url) |
| |
| def update(self): |
| """Download the feed to refresh the information. |
| |
| This does the actual work of pulling down the feed and if it changes |
| updates the cached information about the feed and entries within it. |
| """ |
| info = feedparser.parse(self.url, |
| etag=self.url_etag, modified=self.url_modified, |
| agent=self._planet.user_agent) |
| if info.has_key("status"): |
| self.url_status = str(info.status) |
| elif info.has_key("entries") and len(info.entries)>0: |
| self.url_status = str(200) |
| elif info.bozo and info.bozo_exception.__class__.__name__=='Timeout': |
| self.url_status = str(408) |
| else: |
| self.url_status = str(500) |
| |
| if self.url_status == '301' and \
|    (info.has_key("entries") and len(info.entries)>0):
|     log.warning("Feed has moved from <%s> to <%s>", self.url, info.url)
|     try:
|         os.link(cache.filename(self._planet.cache_directory, self.url),
|                 cache.filename(self._planet.cache_directory, info.url))
|     except (AttributeError, OSError):
|         # hard links may be unsupported, or the new name may already exist
|         pass
|     self.url = info.url
| elif self.url_status == '304': |
| log.info("Feed %s unchanged", self.feed_information()) |
| return |
| elif self.url_status == '410': |
| log.info("Feed %s gone", self.feed_information()) |
| self.cache_write() |
| return |
| elif self.url_status == '408': |
| log.warning("Feed %s timed out", self.feed_information()) |
| return |
| elif int(self.url_status) >= 400: |
| log.error("Error %s while updating feed %s", |
| self.url_status, self.feed_information()) |
| return |
| else: |
| log.info("Updating feed %s", self.feed_information()) |
| |
| self.url_etag = info.has_key("etag") and info.etag or None |
| self.url_modified = info.has_key("modified") and info.modified or None |
| if self.url_etag is not None: |
| log.debug("E-Tag: %s", self.url_etag) |
| if self.url_modified is not None: |
| log.debug("Last Modified: %s", |
| time.strftime(TIMEFMT_ISO, self.url_modified)) |
| |
| self.update_info(info.feed) |
| self.update_entries(info.entries) |
| self.cache_write() |
| |
| def update_info(self, feed): |
| """Update information from the feed. |
| |
| This reads the feed information supplied by feedparser and updates |
| the cached information about the feed. These are the various |
| potentially interesting properties that you might care about. |
| """ |
| for key in feed.keys(): |
| if key in self.IGNORE_KEYS or key + "_parsed" in self.IGNORE_KEYS: |
| # Ignored fields |
| pass |
| elif feed.has_key(key + "_parsed"): |
| # Ignore unparsed date fields |
| pass |
| elif key.endswith("_detail"): |
| # retain name and email sub-fields |
| if feed[key].has_key('name') and feed[key].name: |
| self.set_as_string(key.replace("_detail","_name"), \ |
| feed[key].name) |
| if feed[key].has_key('email') and feed[key].email: |
| self.set_as_string(key.replace("_detail","_email"), \ |
| feed[key].email) |
| elif key == "items": |
| # Ignore items field |
| pass |
| elif key.endswith("_parsed"): |
| # Date fields |
| if feed[key] is not None: |
| self.set_as_date(key[:-len("_parsed")], feed[key]) |
| elif key == "image": |
| # Image field: save all the information |
| if feed[key].has_key("url"): |
| self.set_as_string(key + "_url", feed[key].url) |
| if feed[key].has_key("link"): |
| self.set_as_string(key + "_link", feed[key].link) |
| if feed[key].has_key("title"): |
| self.set_as_string(key + "_title", feed[key].title) |
| if feed[key].has_key("width"): |
| self.set_as_string(key + "_width", str(feed[key].width)) |
| if feed[key].has_key("height"): |
| self.set_as_string(key + "_height", str(feed[key].height)) |
| elif isinstance(feed[key], (str, unicode)): |
| # String fields |
| try: |
| detail = key + '_detail' |
| if feed.has_key(detail) and feed[detail].has_key('type'): |
| if feed[detail].type == 'text/html': |
| feed[key] = sanitize.HTML(feed[key]) |
| elif feed[detail].type == 'text/plain': |
| feed[key] = escape(feed[key]) |
| self.set_as_string(key, feed[key]) |
| except KeyboardInterrupt: |
| raise |
| except: |
| log.exception("Ignored '%s' of <%s>, unknown format", |
| key, self.url) |
| |
| def update_entries(self, entries): |
| """Update entries from the feed. |
| |
| This reads the entries supplied by feedparser and updates the |
| cached information about them. It's at this point we update |
| the 'updated' timestamp and keep the old one in 'last_updated', |
| these provide boundaries for acceptable entry times. |
| |
| If this is the first time a feed has been updated then most of the |
| items will be marked as hidden, according to Planet.new_feed_items. |
| |
| If the feed does not contain items which, according to the sort order, |
| should be there, those items are assumed to have been expired from
| the feed or replaced, and are removed from the cache.
| """ |
| if not len(entries): |
| return |
| |
| self.last_updated = self.updated |
| self.updated = time.gmtime() |
| |
| new_items = [] |
| feed_items = [] |
| for entry in entries: |
| # Try really hard to find some kind of unique identifier |
| if entry.has_key("id"): |
| entry_id = cache.utf8(entry.id) |
| elif entry.has_key("link"): |
| entry_id = cache.utf8(entry.link) |
| elif entry.has_key("title"): |
| entry_id = (self.url + "/" |
| + md5.new(cache.utf8(entry.title)).hexdigest()) |
| elif entry.has_key("summary"): |
| entry_id = (self.url + "/" |
| + md5.new(cache.utf8(entry.summary)).hexdigest()) |
| else: |
| log.error("Unable to find or generate id, entry ignored") |
| continue |
| |
| # Create the item if necessary and update |
| if self.has_item(entry_id): |
| item = self._items[entry_id] |
| else: |
| item = NewsItem(self, entry_id) |
| self._items[entry_id] = item |
| new_items.append(item) |
| item.update(entry) |
| feed_items.append(entry_id) |
| |
| # Hide excess items the first time through |
| if self.last_updated is None and self._planet.new_feed_items \ |
| and len(feed_items) > self._planet.new_feed_items: |
| item.hidden = "yes" |
| log.debug("Marked <%s> as hidden (new feed)", entry_id) |
| |
| # Assign order numbers in reverse |
| new_items.reverse() |
| for item in new_items: |
| item.order = self.next_order = str(int(self.next_order) + 1) |
| |
| # Check for expired or replaced items |
| feed_count = len(feed_items) |
| log.debug("Items in Feed: %d", feed_count) |
| for item in self.items(sorted=1): |
| if feed_count < 1: |
| break |
| elif item.id in feed_items: |
| feed_count -= 1 |
| elif item._channel.url_status != '226': |
| del(self._items[item.id]) |
| self._expired.append(item) |
| log.debug("Removed expired or replaced item <%s>", item.id) |
| |
| def get_name(self, key):
|     """Return the channel name, from whichever key holds it.
| 
|     The passed key is ignored; "name" and "title" are probed in
|     preference order.
|     """
| for key in ("name", "title"): |
| if self.has_key(key) and self.key_type(key) != self.NULL: |
| return self.get_as_string(key) |
| |
| return "" |
| |
| class NewsItem(cache.CachedInfo): |
| """An item of news. |
| |
| This class represents a single item of news on a channel. They're |
| created by members of the Channel class and accessible through it. |
| |
| Properties: |
| id Channel-unique identifier for this item. |
| id_hash Relatively short, printable cryptographic hash of id |
| date Corrected UTC-Normalised update time, for sorting. |
| order Order in which items on the same date can be sorted. |
| hidden Item should be hidden (True if exists). |
| |
| title One-line title (*). |
| link Link to the original format text (*). |
| summary Short first-page summary (*). |
| content Full HTML content. |
| |
| modified Date the item claims to have been modified (*). |
| issued Date the item claims to have been issued (*). |
| created Date the item claims to have been created (*). |
| expired Date the item claims to expire (*). |
| |
| author Name of the author (*). |
| publisher Name of the publisher (*). |
| category Category name (*). |
| comments Link to a page to enter comments (*). |
| license Link to the licence for the content (*). |
| source_name Name of the original source of this item (*). |
| source_link Link to the original source of this item (*). |
| |
| Properties marked (*) will only be present if the original feed |
| contained them. Note that the various optional date fields are |
| simply claims made by the item and parsed from the information |
| given, 'date' is a far more reliable source of information. |
| |
| Some feeds may define additional properties to those above. |
| """ |
| IGNORE_KEYS = ("categories", "contributors", "enclosures", "links", |
| "guidislink", "date", "tags") |
| |
| def __init__(self, channel, id_): |
| cache.CachedInfo.__init__(self, channel._cache, id_) |
| |
| self._channel = channel |
| self.id = id_ |
| self.id_hash = md5.new(id_).hexdigest() |
| self.date = None |
| self.order = None |
| self.content = None |
| self.cache_read() |
| |
| def update(self, entry): |
| """Update the item from the feedparser entry given.""" |
| for key in entry.keys(): |
| if key in self.IGNORE_KEYS or key + "_parsed" in self.IGNORE_KEYS: |
| # Ignored fields |
| pass |
| elif entry.has_key(key + "_parsed"): |
| # Ignore unparsed date fields |
| pass |
| elif key.endswith("_detail"): |
| # retain name, email, and language sub-fields |
| if entry[key].has_key('name') and entry[key].name: |
| self.set_as_string(key.replace("_detail","_name"), \ |
| entry[key].name) |
| if entry[key].has_key('email') and entry[key].email: |
| self.set_as_string(key.replace("_detail","_email"), \ |
| entry[key].email) |
| if entry[key].has_key('language') and entry[key].language and \ |
| (not self._channel.has_key('language') or \ |
| entry[key].language != self._channel.language): |
| self.set_as_string(key.replace("_detail","_language"), \ |
| entry[key].language) |
| elif key.endswith("_parsed"): |
| # Date fields |
| if entry[key] is not None: |
| self.set_as_date(key[:-len("_parsed")], entry[key]) |
| elif key == "source": |
| # Source field: save both url and value |
| if entry[key].has_key("value"): |
| self.set_as_string(key + "_name", entry[key].value) |
| if entry[key].has_key("url"): |
| self.set_as_string(key + "_link", entry[key].url) |
| elif key == "content": |
| # Content field: concatenate the values |
| value = "" |
| for item in entry[key]: |
| if item.type == 'text/html': |
| item.value = sanitize.HTML(item.value) |
| elif item.type == 'text/plain': |
| item.value = escape(item.value) |
| if item.has_key('language') and item.language and \ |
| (not self._channel.has_key('language') or |
| item.language != self._channel.language) : |
| self.set_as_string(key + "_language", item.language) |
| value += cache.utf8(item.value) |
| self.set_as_string(key, value) |
| elif isinstance(entry[key], (str, unicode)): |
| # String fields |
| try: |
| detail = key + '_detail' |
| if entry.has_key(detail): |
| if entry[detail].has_key('type'): |
| if entry[detail].type == 'text/html': |
| entry[key] = sanitize.HTML(entry[key]) |
| elif entry[detail].type == 'text/plain': |
| entry[key] = escape(entry[key]) |
| self.set_as_string(key, entry[key]) |
| except KeyboardInterrupt: |
| raise |
| except: |
| log.exception("Ignored '%s' of <%s>, unknown format", |
| key, self.id) |
| |
| # Generate the date field if we need to |
| self.get_date("date") |
| |
| def get_date(self, key): |
| """Get (or update) the date key. |
| |
| We check whether the date the entry claims to have been changed falls
| between the time we last updated this feed and the time we pulled the
| feed off the site.
| |
| If it is then it's probably not bogus, and we'll sort accordingly. |
| |
| If it isn't then we bound it appropriately; this ensures that
| entries appear in posting sequence but don't overlap entries |
| added in previous updates and don't creep into the next one. |
| """ |
| |
| for other_key in ("updated", "modified", "published", "issued", "created"): |
| if self.has_key(other_key): |
| date = self.get_as_date(other_key) |
| break |
| else: |
| date = None |
| |
| if date is not None: |
| if date > self._channel.updated: |
| date = self._channel.updated |
| # elif date < self._channel.last_updated: |
| # date = self._channel.updated |
| elif self.has_key(key) and self.key_type(key) != self.NULL: |
| return self.get_as_date(key) |
| else: |
| date = self._channel.updated |
| |
| self.set_as_date(key, date) |
| return date |
| |
| def get_content(self, key):
|     """Return the item content, from whichever key holds it.
| 
|     The passed key is ignored; "content", "tagline" and "summary" are
|     probed in preference order.
|     """
| for key in ("content", "tagline", "summary"): |
| if self.has_key(key) and self.key_type(key) != self.NULL: |
| return self.get_as_string(key) |
| |
| return "" |