| #!/usr/bin/env python |
| # -*- coding: UTF-8 -*- |
| """Item cache. |
| |
| Between runs of Planet we need somewhere to store the feed information |
| we parsed, this is so we don't lose information when a particular feed |
| goes away or is too short to hold enough items. |
| |
| This module provides the code to handle this cache transparently enough |
| that the rest of the code can take the persistence for granted. |
| """ |
| |
| import os |
| import re |
| |
| |
# Regular expressions used to sanitise cache filenames.  Each pattern is
# compiled once at import time and bound to its own module-level name.
(re_url_scheme,
 re_slash,
 re_initial_cruft,
 re_final_cruft) = map(re.compile, (
    r'^[^:]*://',   # everything up to and including the first "://"
    r'[?/]+',       # runs of "/" and "?" characters
    r'^[,.]*',      # commas and dots at the very start
    r'[,.]*$',      # commas and dots at the very end
))
| |
| |
class CachedInfo:
    """Cached information.

    This class is designed to hold information that is stored in a cache
    between instances.  It can act both as a dictionary (c['foo']) and
    as an object (c.foo) to get and set values and supports both string
    and date values.

    If you wish to support special fields you can derive a class off this
    and implement get_FIELD and set_FIELD functions which will be
    automatically called.

    Layout in the backing cache (any mapping with a sync() method):

    * an index entry holding the space-separated names of the stored
      keys; it lives under " keys" for a root object and under the
      object's id otherwise,
    * for every stored key, the value under cache_key(key) and its type
      name ("string", "date" or "null") under cache_key(key) + " type".

    Spaces in key names are always replaced by underscores, and spaces in
    the id by "%20", so the space can serve as the separator.
    """
    STRING = "string"
    DATE = "date"
    NULL = "null"

    def __init__(self, cache, id_, root=0):
        """Create an empty record bound to a cache.

        cache is the backing mapping, id_ identifies this record inside
        it, and a true root makes the record use un-prefixed cache keys.
        Nothing is read from the cache until cache_read() is called.
        """
        # Per-key state: type name, stored (string) value, and whether
        # the key should be persisted by cache_write().
        self._type = {}
        self._value = {}
        self._cached = {}

        self._cache = cache
        self._id = id_.replace(" ", "%20")
        self._root = root

    def _index_key(self):
        """Return the cache key under which the list of key names lives."""
        if self._root:
            return " keys"
        return self._id

    def _stored_keys(self):
        """Return the key names listed in the cache index.

        Returns None when the cache holds no index for this record and
        an empty list when the index exists but names no keys.
        """
        try:
            index = self._cache[self._index_key()]
        except KeyError:
            return None

        # "".split(" ") would yield [""], a key that was never written.
        if not index:
            return []
        return index.split(" ")

    def _discard(self, cache_key):
        """Delete a value and its type entry, ignoring missing entries."""
        for stored in (cache_key, cache_key + " type"):
            try:
                del self._cache[stored]
            except KeyError:
                pass

    def cache_key(self, key):
        """Return the cache key name for the given key."""
        key = key.replace(" ", "_")
        if self._root:
            return key
        return self._id + " " + key

    def cache_read(self):
        """Read information from the cache.

        Keys that were set locally as non-cached are left untouched;
        every other key listed in the cache index is (re)loaded.  Does
        nothing when the cache has no index for this record.
        """
        keys = self._stored_keys()
        if keys is None:
            return

        for key in keys:
            if key in self._cached and not self._cached[key]:
                # A local, non-cached value takes precedence.
                continue

            cache_key = self.cache_key(key)
            self._value[key] = self._cache[cache_key]
            self._type[key] = self._cache[cache_key + " type"]
            self._cached[key] = 1

    def cache_write(self, sync=1):
        """Write information to the cache.

        Previously stored entries are removed first, then every key
        marked as cached is written together with a fresh index.  The
        cache's sync() method is called afterwards unless sync is false.
        """
        self.cache_clear(sync=0)

        keys = []
        for key in self.keys():
            cache_key = self.cache_key(key)
            if not self._cached[key]:
                # Non-cached keys must not linger in the cache.
                self._discard(cache_key)
                continue

            keys.append(key)
            self._cache[cache_key] = self._value[key]
            self._cache[cache_key + " type"] = self._type[key]

        self._cache[self._index_key()] = " ".join(keys)
        if sync:
            self._cache.sync()

    def cache_clear(self, sync=1):
        """Remove information from the cache.

        Deletes the index and every entry it lists.  Does nothing (and
        does not sync) when the cache has no index for this record.
        """
        keys = self._stored_keys()
        if keys is None:
            return

        del self._cache[self._index_key()]
        for key in keys:
            self._discard(self.cache_key(key))

        if sync:
            self._cache.sync()

    def has_key(self, key):
        """Check whether the key exists."""
        return key.replace(" ", "_") in self._value

    def key_type(self, key):
        """Return the key type (raises KeyError for an unknown key)."""
        return self._type[key.replace(" ", "_")]

    def set(self, key, value, cached=1):
        """Set the value of the given key.

        If a set_KEY function exists that is called with (key, value);
        otherwise None is stored as the null type, and anything else is
        stored as a string, falling back to the date function when the
        string conversion raises TypeError.  cached is forwarded to the
        typed setter and controls whether cache_write() persists the key.
        """
        key = key.replace(" ", "_")

        try:
            func = getattr(self, "set_" + key)
        except AttributeError:
            pass
        else:
            return func(key, value)

        if value is None:
            return self.set_as_null(key, value, cached)

        try:
            return self.set_as_string(key, value, cached)
        except TypeError:
            return self.set_as_date(key, value, cached)

    def get(self, key):
        """Return the value of the given key.

        If a get_KEY function exists that is called otherwise the
        correctly typed function is called if that exists; failing both
        the raw stored value is returned.  Raises KeyError for an
        unknown key.
        """
        key = key.replace(" ", "_")

        try:
            func = getattr(self, "get_" + key)
        except AttributeError:
            pass
        else:
            return func(key)

        try:
            func = getattr(self, "get_as_" + self._type[key])
        except AttributeError:
            pass
        else:
            return func(key)

        return self._value[key]

    def set_as_string(self, key, value, cached=1):
        """Set the key to the string value.

        The value is passed through the module's utf8() helper before
        being stored; a TypeError from that conversion propagates to the
        caller.
        """
        value = utf8(value)

        key = key.replace(" ", "_")
        self._value[key] = value
        self._type[key] = self.STRING
        self._cached[key] = cached

    def get_as_string(self, key):
        """Return the key as a string value."""
        key = key.replace(" ", "_")
        if not self.has_key(key):
            raise KeyError(key)

        return self._value[key]

    def set_as_date(self, key, value, cached=1):
        """Set the key to the date value.

        The date should be a 9-item tuple as returned by time.gmtime().
        It is stored as its items joined by single spaces.
        """
        value = " ".join([str(s) for s in value])

        key = key.replace(" ", "_")
        self._value[key] = value
        self._type[key] = self.DATE
        self._cached[key] = cached

    def get_as_date(self, key):
        """Return the key as a date value (a tuple of integers)."""
        key = key.replace(" ", "_")
        if not self.has_key(key):
            raise KeyError(key)

        value = self._value[key]
        return tuple([int(i) for i in value.split(" ")])

    def set_as_null(self, key, value, cached=1):
        """Set the key to the null value.

        This only exists to make things less magic.  The given value is
        ignored and an empty string is stored.
        """
        key = key.replace(" ", "_")
        self._value[key] = ""
        self._type[key] = self.NULL
        self._cached[key] = cached

    def get_as_null(self, key):
        """Return the key as the null value."""
        key = key.replace(" ", "_")
        if not self.has_key(key):
            raise KeyError(key)

        return None

    def del_key(self, key):
        """Delete the given key (raises KeyError for an unknown key)."""
        key = key.replace(" ", "_")
        if not self.has_key(key):
            raise KeyError(key)

        del self._value[key]
        del self._type[key]
        del self._cached[key]

    def keys(self):
        """Return the list of known keys."""
        return list(self._value)

    def __iter__(self):
        """Iterate over a snapshot of the known keys."""
        return iter(self.keys())

    # Special methods
    __contains__ = has_key
    __setitem__ = set_as_string
    __getitem__ = get
    __delitem__ = del_key
    __delattr__ = del_key

    def __setattr__(self, key, value):
        """Store private (underscore) names on the instance, others as keys."""
        if key.startswith("_"):
            self.__dict__[key] = value
        else:
            self.set(key, value)

    def __getattr__(self, key):
        """Expose stored keys as attributes.

        Only called when normal attribute lookup fails.
        """
        if "_value" not in self.__dict__:
            # __init__ has not run (e.g. while being copied or
            # unpickled); has_key() would look up self._value and land
            # straight back here, recursing without end.
            raise AttributeError(key)

        if self.has_key(key):
            return self.get(key)
        raise AttributeError(key)
| |
| |
def filename(directory, filename):
    """Build the path of the cache file for a feed inside *directory*.

    The feed name (typically a URL) is cleaned up by applying, in order:
    removal of the leading URL scheme, replacement of each run of "/" and
    "?" with a single comma, and removal of leading and trailing commas
    and dots.  The result is joined onto *directory*.
    """
    substitutions = (
        (re_url_scheme, ""),
        (re_slash, ","),
        (re_initial_cruft, ""),
        (re_final_cruft, ""),
    )

    cleaned = filename
    for pattern, replacement in substitutions:
        cleaned = pattern.sub(replacement, cleaned)

    return os.path.join(directory, cleaned)
| |
def utf8(value):
    """Return the value as a UTF-8 encoded byte string.

    Unicode text (including instances of subclasses of the text type) is
    simply encoded.  Anything else is treated as encoded bytes: it is
    decoded as UTF-8 and, if that fails, as ISO-8859-1, then re-encoded
    as UTF-8.

    Raises TypeError when the value is neither text nor something that
    can be decoded (for example a tuple or None); CachedInfo.set relies
    on that to detect non-string values.
    """
    try:
        text_type = unicode
    except NameError:
        # No separate unicode builtin: str is the Unicode text type.
        text_type = str

    # isinstance rather than an exact type comparison, so subclasses of
    # the text type are encoded instead of being sent down the decoding
    # path, where they would raise TypeError.
    if isinstance(value, text_type):
        return value.encode("utf-8")

    try:
        return text_type(value, "utf-8").encode("utf-8")
    except UnicodeError:
        # ISO-8859-1 assigns a character to every byte value, so this
        # decode cannot raise UnicodeError and needs no further fallback.
        return text_type(value, "iso-8859-1").encode("utf-8")