#!/usr/bin/env python

"""
HtmlDom.py - Fetch a URI to an (X)HTML resource and return its DOM

Fetching arbitrary HTML from the Web triggers a number of bugs and
limitations in PyXML's DOM parsers. This convenience function tries to
navigate around as many of them as possible, in order to return a
navigable DOM for arbitrary URIs that return HTML or XHTML
representations.

Applications which use this library should set UAString appropriately,
so that they can be reliably identified.

Caveats:
 * This library has only been tested with PyXML 0.71; other versions
   may or may not work as designed.
 * PyXML's HTML parser returns a DOM with uppercased element names,
   while XHTML uses lowercase element names. As a result, code which
   uses the resultant DOM needs to be aware of its source; this can be
   achieved by using the .isXml() and .isHtml() methods. For example:

     d = fetch(uri)
     if d.isHtml():
         title = xml.xpath.Evaluate("/HTML/HEAD/TITLE/text()", d)
     elif d.isXml():
         title = xml.xpath.Evaluate("/html/head/title/text()", d)

 * HTML is transformed into a DOM, sometimes losing (usually
   unimportant) information, such as the DTD, the lang attribute of
   HTML, certain character references, etc. As a result, it is NOT
   recommended to create HTML for display from the returned DOM.
 * If a URI returns an XHTML representation, it may be fetched from the
   server twice.
"""

__license__ = """
Copyright (c) 2005 Mark Nottingham

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

__version__ = "0.31"

import signal
import urllib
import xml
from xml.dom.ext.reader import HtmlLib, Sgmlop, PyExpat

# User-Agent string sent with every request; applications should override it.
UAString = "HtmlDom.py/%s (http://www.mnot.net/python/HtmlDom.py)" % __version__


def fetch(uri, timeout=60):
    """Fetch a URI to an (X)HTML resource and return the DOM.

    The URI is first parsed with the lenient HTML reader. If that raises
    xml.dom.NamespaceErr the resource is taken to be XML and is fetched and
    parsed again with the expat-based reader.

    Arguments:
      uri     -- the URI to fetch.
      timeout -- seconds allowed for each fetch-and-parse attempt, enforced
                 with signal.alarm(); 0 means no time limit.

    Returns the document object produced by the reader's fromUri().

    Raises IOError('timeout') if an attempt exceeds the time limit; any other
    exception raised by the reader propagates unchanged.

    Side effects: sets urllib.URLopener.version to UAString, and installs a
    SIGALRM handler for the duration of the call (the handler that was in
    place beforehand is put back on exit). Because it relies on SIGALRM, any
    alarm the caller had pending is cancelled.
    """
    urllib.URLopener.version = UAString
    previous_handler = signal.signal(signal.SIGALRM, _alrm_handler)
    try:
        try:
            return _read_with_timeout(_HTMLReader(), uri, timeout)
        except xml.dom.NamespaceErr:
            # it's XML
            return _read_with_timeout(_XHTMLReader(), uri, timeout)
    finally:
        # signal.signal() returns None when the earlier handler was not
        # installed from Python; that value cannot be passed back in.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)


def _read_with_timeout(reader, uri, timeout):
    """Return reader.fromUri(uri), arming a SIGALRM for timeout seconds."""
    signal.alarm(timeout)
    try:
        return reader.fromUri(uri)
    finally:
        # Always disarm, so a late alarm cannot fire in the caller's code.
        signal.alarm(0)


def _alrm_handler(signum, frame):
    """SIGALRM handler: abort the fetch in progress with IOError('timeout')."""
    raise IOError('timeout')


class _XHTMLReader(PyExpat.Reader):
    """Reader used for resources that turn out to be XML (XHTML)."""
    pass


class _HTMLReader(HtmlLib.Reader):
    """HTML reader that parses with the more forgiving _HtmlParser."""

    def __init__(self):
        self.parser = _HtmlParser()


class _HtmlParser(Sgmlop.HtmlParser):
    """
    Override the handle_special method in HtmlParser so that we don't
    choke on HTML doctype declarations. Also, override handle_charref,
    as javascript seems to make later versions of PyXML extremely unhappy.
    """

    def handle_special(self, data):
        pass

    def handle_charref(self, data):
        pass


if __name__ == '__main__':
    import sys
    import xml.xpath

    if len(sys.argv) < 2:
        sys.stderr.write("Usage: %s uri\n" % sys.argv[0])
        sys.exit(1)

    d = fetch(sys.argv[1])
    if d.isHtml():
        title = xml.xpath.Evaluate("/HTML/HEAD/TITLE/text()", d)
    elif d.isXml():
        title = xml.xpath.Evaluate("/html/head/title/text()", d)
    else:
        title = []

    if not title:
        sys.stderr.write("No title found in %s\n" % sys.argv[1])
        sys.exit(1)
    xml.dom.ext.PrettyPrint(title[0])