#!/usr/bin/env python
"""
HtmlDom.py - Fetch a URI to an (X)HTML resource and return its DOM
Fetching arbitrary HTML from the Web triggers a number of bugs and
limitations in PyXML's DOM parsers. This convenience function tries
to navigate around as many of them as possible, in order to return
a navigable DOM for arbitrary URIs that return HTML or XHTML
representations.
Applications which use this library should set UAString appropriately,
so that they can be reliably identified.
Caveats:
* This library has only been tested with PyXML 0.71; other versions
may or may not work as designed.
* PyXML's HTML parser returns a DOM with uppercased element names,
while XHTML uses lowercase element names. As a result, code which
uses the resultant DOM needs to be aware of its source; this can
be achieved by using the .isXml() and .isHtml() methods. For
example:
      d = fetch(uri)
      if d.isHtml():
          title = xml.xpath.Evaluate("/HTML/HEAD/TITLE/text()", d)
      elif d.isXml():
          title = xml.xpath.Evaluate("/html/head/title/text()", d)
* HTML is transformed into a DOM, sometimes losing (usually
unimportant) information, such as the DTD, the lang attribute of
HTML, certain character references, etc. As a result, it is NOT
recommended to create HTML for display from the returned DOM.
* If a URI returns an XHTML representation, it may be fetched from
the server twice.
"""
__license__ = """
Copyright (c) 2005 Mark Nottingham
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
# Version of this module; interpolated into UAString below.
__version__ = "0.31"
import signal, xml, urllib
from xml.dom.ext.reader import HtmlLib, Sgmlop, PyExpat
# User-Agent value that fetch() assigns to urllib.URLopener.version.
# Applications using this library should set it so they can be identified.
UAString = "HtmlDom.py/%s (http://www.mnot.net/python/HtmlDom.py)" % __version__
def fetch(uri, timeout=60):
    """Fetch a URI to an (X)HTML resource and return the DOM.

    The resource is first parsed with the HTML reader. If that raises
    xml.dom.NamespaceErr the resource is treated as XML and fetched a
    second time with the XHTML reader.

    uri     -- location of the resource to fetch.
    timeout -- seconds allowed for *each* fetch attempt, enforced with
               signal.alarm(); a resource that falls back to the XHTML
               reader may therefore take up to twice this long.

    Returns the DOM built by whichever reader succeeded.

    Raises whatever the SIGALRM handler raises (IOError 'timeout') when
    an attempt runs out of time; any other reader error propagates
    unchanged.

    Side effects: urllib.URLopener.version is set to UAString. SIGALRM
    is taken over for the duration of the call, and the handler that was
    installed beforehand is put back on exit.
    """
    urllib.URLopener.version = UAString
    previous_handler = signal.signal(signal.SIGALRM, _alrm_handler)
    try:
        reader = _HTMLReader()
        try:
            return _fetch_with_alarm(reader, uri, timeout)
        except xml.dom.NamespaceErr:    # it's XML
            return _fetch_with_alarm(_XHTMLReader(), uri, timeout)
    finally:
        # signal.signal() returns None when the earlier handler was not
        # installed from Python; that value cannot be passed back in.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)


def _fetch_with_alarm(reader, uri, timeout):
    """Run reader.fromUri(uri) under an alarm of *timeout* seconds."""
    signal.alarm(timeout)
    try:
        return reader.fromUri(uri)
    finally:
        # Always cancel the alarm so it cannot fire after we return.
        signal.alarm(0)
def _alrm_handler(signum, frame):
    """Signal handler that turns an alarm into IOError('timeout').

    signum and frame are the arguments supplied to every signal handler;
    neither is used.
    """
    # Call form of raise: equivalent to the old "raise IOError, 'timeout'"
    # on Python 2 and also valid syntax on Python 3.
    raise IOError('timeout')
class _XHTMLReader(PyExpat.Reader):
    """PyExpat reader, unmodified; fetch() uses it for XML resources."""
class _HTMLReader(HtmlLib.Reader):
    """HtmlLib reader whose parser is the tolerant _HtmlParser."""

    def __init__(self):
        # The base-class initializer is not invoked; the only state set
        # up here is the parser attribute.
        tolerant_parser = _HtmlParser()
        self.parser = tolerant_parser
class _HtmlParser(Sgmlop.HtmlParser):
    """
    HtmlParser with two handlers turned into no-ops.

    handle_special is overridden so that we don't choke on HTML doctype
    declarations. handle_charref is overridden as well, because
    javascript seems to make later versions of PyXML extremely unhappy.
    """

    def handle_special(self, data):
        """Deliberately do nothing with *data*."""
        return None

    def handle_charref(self, data):
        """Deliberately do nothing with *data*."""
        return None
if __name__ == '__main__':
    # Command-line demo: pretty-print the title text of the document at
    # the URI given as the first argument.
    import sys
    import xml.xpath

    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s uri\n" % sys.argv[0])
        sys.exit(2)

    d = fetch(sys.argv[1])
    # The two branches differ only in the case of the element names in
    # the XPath expression.
    if d.isHtml():
        title = xml.xpath.Evaluate("/HTML/HEAD/TITLE/text()", d)
    elif d.isXml():
        title = xml.xpath.Evaluate("/html/head/title/text()", d)
    else:
        # Previously fell through and failed with NameError on 'title'.
        sys.stderr.write("document is neither HTML nor XML\n")
        sys.exit(1)

    if not title:
        # Previously failed with IndexError on title[0].
        sys.stderr.write("no title found\n")
        sys.exit(1)
    xml.dom.ext.PrettyPrint(title[0])