#! /usr/bin/python
"""\
XPath built-in for cwm

See cwm.py
"""

import thing
import notation3    # N3 parsers and generators, and RDF generator
from thing import *

from xml.xpath import Evaluate
from xml.dom import ext
from xml.dom.ext.reader import HtmlLib
from xml.dom.ext.reader import Sgmlop

XPATH_NS_URI = "http://www.mnot.net/2002/02/18/cwm_xpath#"

__version__ = "0.1"


def _match_xpath(store, markup, pattern):
    """Parse markup, evaluate an XPath pattern on it and wrap the result.

    markup is parsed with this module's Reader, pattern is evaluated
    against the resulting document element, and the matched nodes are
    flattened to one string by getdata() and converted with
    store._fromPython().

    Returns None when the evaluation result is empty (falsy).
    """
    reader = Reader()
    dom = reader.fromString(markup)
    found = Evaluate(pattern, dom.documentElement)
    if not found:
        return None
    return store._fromPython(getdata(found))


class BI_MatchHtmlFragment(LightBuiltIn, Function):
    """\
    a built-in for HTML scraping using XPath.

    takes a list of 2 strings; the first is the input data, and the
    second is an xpath expression which yields one or more strings.
    Combines the matched string(s) into a single string.

    The input data is assumed to be an HTML fragment; if it's an
    entire HTML page, use matchHtml.

    The XPath engine is forgiving; it will take any old HTML, not
    just XHTML. However, your expression MUST use all caps for
    element names.
    """
    def evaluateObject(self, store, context, subj, subj_py):
        """Return the matched text for the (data, xpath) pair in subj_py.

        subj_py must unpack into exactly two items: the input data and
        the XPath expression. Returns None when nothing matches.
        """
        if thing.verbosity() > 80:
            progress("xpath input:" + repr(subj_py))
        markup, pattern = subj_py
        # NOTE(review): the template below only coerces the input to a
        # string; if fragments are meant to be wrapped in enclosing
        # markup before parsing, that wrapper is not present here --
        # confirm against the intended behaviour.
        return _match_xpath(store, "%s" % markup, pattern)


class BI_MatchHtml(LightBuiltIn, Function):
    """\
    a built-in for HTML scraping using XPath.

    takes a list of 2 strings; the first is the input data, and the
    second is an xpath expression which yields one or more strings.
    Combines the matched string(s) into a single string.

    The input data is assumed to be an entire HTML page; if it's
    an HTML fragment, use matchHtmlFragment.

    The XPath engine is forgiving; it will take any old HTML, not
    just XHTML. However, your expression MUST use all caps for
    element names.
    """
    def evaluateObject(self, store, context, subj, subj_py):
        """Return the matched text for the (data, xpath) pair in subj_py.

        subj_py must unpack into exactly two items: the input data and
        the XPath expression. The data is handed to the parser as-is.
        Returns None when nothing matches.
        """
        if thing.verbosity() > 80:
            progress("xpath input:" + repr(subj_py))
        markup, pattern = subj_py
        return _match_xpath(store, markup, pattern)


def getdata(inp):
    """Flatten matched nodes into a single space-separated string.

    For each node in inp, nodes named '#text' contribute
    _get_data(); every other node contributes _get_value().
    """
    # NOTE(review): this reads the double-underscore attribute directly.
    # Keep this function at module level: inside a class body Python
    # would mangle the name `__nodeName` and the lookup would change.
    pieces = []
    for node in inp:
        if node.__nodeName == '#text':
            pieces.append(node._get_data())
        else:
            pieces.append(node._get_value())
    # str.join avoids depending on a `string` module that this file
    # never imports itself.
    return ' '.join(pieces)


class Reader(HtmlLib.Reader):
    """HtmlLib reader whose parser is replaced by MyHtmlParser."""
    def __init__(self):
        # The base-class initialiser is not called here; only the parser
        # attribute is set.
        self.parser = MyHtmlParser()


class MyHtmlParser(Sgmlop.HtmlParser):
    """Sgmlop HTML parser that ignores everything passed to handle_special."""
    def handle_special(self, data):
        pass


def register(store):
    """Register both XPath built-ins with store.

    The namespace resource is interned from XPATH_NS_URI with its last
    character (the '#') removed, and the built-ins are attached as the
    fragments "matchHtmlFragment" and "matchHtml".
    """
    xp = store.internURI(XPATH_NS_URI[:-1])
    xp.internFrag("matchHtmlFragment", BI_MatchHtmlFragment)
    xp.internFrag("matchHtml", BI_MatchHtml)