| from os.path import dirname, join |
| |
| from collections import OrderedDict |
| |
| from xml.parsers import expat |
| import xml.etree.ElementTree as etree |
| |
| _catalog = join(dirname(__file__), "catalog") |
| |
| def _wrap_error(e): |
| err = etree.ParseError(e) |
| err.code = e.code |
| err.position = e.lineno, e.offset |
| raise err |
| |
| _names = {} |
| def _fixname(key): |
| try: |
| name = _names[key] |
| except KeyError: |
| name = key |
| if "}" in name: |
| name = "{" + name |
| _names[key] = name |
| return name |
| |
| |
| class XMLParser(object): |
| """ |
| An XML parser with support for XHTML DTDs and all Python-supported encodings |
| |
| This implements the API defined by |
| xml.etree.ElementTree.XMLParser, but supports XHTML DTDs |
| (therefore allowing XHTML entities) and supports all encodings |
| Python does, rather than just those supported by expat. |
| """ |
| def __init__(self, encoding=None): |
| self._parser = expat.ParserCreate(encoding, "}") |
| self._target = etree.TreeBuilder() |
| # parser settings |
| self._parser.buffer_text = 1 |
| self._parser.ordered_attributes = 1 |
| self._parser.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE) |
| # parser callbacks |
| self._parser.XmlDeclHandler = self._xml_decl |
| self._parser.StartElementHandler = self._start |
| self._parser.EndElementHandler = self._end |
| self._parser.CharacterDataHandler = self._data |
| self._parser.ExternalEntityRefHandler = self._external |
| self._parser.SkippedEntityHandler = self._skipped |
| # used for our horrible re-encoding hack |
| self._fed_data = [] |
| self._read_encoding = None |
| |
| def _xml_decl(self, version, encoding, standalone): |
| self._read_encoding = encoding |
| |
| def _start(self, tag, attrib_in): |
| self._fed_data = None |
| tag = _fixname(tag) |
| attrib = OrderedDict() |
| if attrib_in: |
| for i in range(0, len(attrib_in), 2): |
| attrib[_fixname(attrib_in[i])] = attrib_in[i+1] |
| return self._target.start(tag, attrib) |
| |
| def _data(self, text): |
| return self._target.data(text) |
| |
| def _end(self, tag): |
| return self._target.end(_fixname(tag)) |
| |
| def _external(self, context, base, system_id, public_id): |
| if public_id in { |
| "-//W3C//DTD XHTML 1.0 Transitional//EN", |
| "-//W3C//DTD XHTML 1.1//EN", |
| "-//W3C//DTD XHTML 1.0 Strict//EN", |
| "-//W3C//DTD XHTML 1.0 Frameset//EN", |
| "-//W3C//DTD XHTML Basic 1.0//EN", |
| "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN", |
| "-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN", |
| "-//W3C//DTD MathML 2.0//EN", |
| "-//WAPFORUM//DTD XHTML Mobile 1.0//EN" |
| }: |
| parser = self._parser.ExternalEntityParserCreate(context) |
| with open(join(_catalog, "xhtml.dtd"), "rb") as fp: |
| try: |
| parser.ParseFile(fp) |
| except expat.error: |
| return False |
| |
| return True |
| |
| def _skipped(self, name, is_parameter_entity): |
| err = expat.error("undefined entity %s: line %d, column %d" % |
| (name, self._parser.ErrorLineNumber, |
| self._parser.ErrorColumnNumber)) |
| err.code = expat.errors.XML_ERROR_UNDEFINED_ENTITY |
| err.lineno = self._parser.ErrorLineNumber |
| err.offset = self._parser.ErrorColumnNumber |
| raise err |
| |
| def feed(self, data): |
| if self._fed_data is not None: |
| self._fed_data.append(data) |
| try: |
| self._parser.Parse(data, False) |
| except expat.error as v: |
| _wrap_error(v) |
| except ValueError as e: |
| if e.args[0] == 'multi-byte encodings are not supported': |
| assert self._read_encoding is not None |
| xml = b"".join(self._fed_data).decode(self._read_encoding).encode("utf-8") |
| new_parser = XMLParser("utf-8") |
| self._parser = new_parser._parser |
| self._target = new_parser._target |
| self._fed_data = None |
| self.feed(xml) |
| |
| def close(self): |
| try: |
| self._parser.Parse("", True) |
| except expat.error as v: |
| _wrap_error(v) |
| tree = self._target.close() |
| return tree |