import sys import logging import sched import xmldata import xml.parsers.expat class XML(sched.Agent): """XML: An Agent which takes data from another Agent and parses it into XML stanzas. This only deals with input. It has no code to generate or send XML. The XML document is assumed to have the form ... In other words, children of the top-level tag are considered to be the basic units, and an event is triggered each time one completes. Character data between stanzas is ignored. XML(agent, encoding='UTF-8') -- constructor. The *agent* must be of a type that generates 'handle' string events. (For example, tcp.TCP.) If the agent stops, or if it generates an XML closing tag, the XML agent will itself stop. If the *agent* generates an 'error' event, the XML agent will repeat it. By default, *encoding* is UTF-8, and so the data that arrives must be in that encoding. You can specify other encodings. If *encoding* is None, the data should be unicode objects (or convertible to unicode). In all cases, the stanzas that are generated will be unencoded -- they will contain Unicode elements where appropriate. Agent states and events: state 'start': Begin parsing XML. state 'body' (tag, namespace, attrs): The XML opening tag has been received. The handler arguments are the opening tag name, the xmlns attribute (if present), and a dict containing all the tag attributes (including xmlns). event 'stanza' (node): A stanza has been received. The argument is a Node object representing the stanza. event 'error' (exc, agent): An error was detected while parsing the XML. This is immediately followed by a jump to 'end'. state 'end': Stop parsing. (No error is generated if the XML structure is incomplete.) Publicly readable fields: docname -- the top-level tag name (once received) docattrs -- the top-level tag attributes, as a dict (once received) Public methods: basicstanza(nod) -- a simple event handler for 'stanza'. Internal methods: setup() -- 'start' state handler. handle() -- handler for incoming 'handle' events. endparse() -- 'end' state handler. """ logprefix = 'zymb.xml' def __init__(self, stream, encoding='UTF-8'): sched.Agent.__init__(self) self.stream = stream self.encoding = encoding self.docattrs = None self.docname = None self.xmlparse = None self.addhandler('start', self.setup) self.addhandler('end', self.endparse) # All the watchers on stream are marked as "secondary" -- that is, # the actions they generate will be low-priority. This ensures that # each XML stanza is completely handled before the next one is # considered. ac = sched.Action(self.handle) ac.secondary = True stream.addhandler('handle', ac) self.addcleanupaction(ac) ac = sched.Action(self.stop) ac.secondary = True stream.addhandler('end', ac) self.addcleanupaction(ac) ac = sched.Action(self.perform, 'error') ac.secondary = True stream.addhandler('error', ac) self.addcleanupaction(ac) def setup(self): """setup() -- internal 'start' state handler. Do not call. Create the NodeBuilder parser object and initialize it. """ self.xmlparse = ClientNodeGenerator(self, encoding=self.encoding) def handle(self, input): """handle(input) -- internal handler for 'handle' events generated by self.stream. Do not call. Push the data into self.xmlparse. This will trigger 'stanza' events for each XML stanza which is completed. """ try: self.xmlparse.parse(input) except xml.parsers.expat.ExpatError, ex: self.log.warning('XML error: %s', ex) self.perform('error', ex, self) self.stop() def basicstanza(self, nod): """basicstanza(nod) -> None A simple event handler for 'stanza'. This simply writes the received data to stdout: sys.stdout.write(str(nod)) This handler is not installed by default. You can enable it by calling: tcp.addhandler('stanza', tcp.basicstanza) """ sys.stdout.write(str(nod)) def endparse(self): """endparse() -- internal 'end' state handler. Do not call. Shut down and delete self.xmlparse. """ if (self.xmlparse): self.xmlparse.close() self.xmlparse = None self.log.info('destroyed xml parser') class ClientNodeGenerator(xmldata.NodeGenerator): """ClientNodeGenerator: A NodeGenerator which parses a sequence of incoming character data, and dispatches stanzas (children of the top-level XML node) as they are completed. ClientNodeGenerator(owner, encoding='UTF-8') -- constructor. The *owner* is an XML Agent on which to trigger events as stanzas arrive. By default, *encoding* is UTF-8, and so the data that arrives must be in that encoding. You can specify other encodings. If *encoding* is None, the data should be unicode objects (or convertible to unicode). In all cases, the stanzas that are generated will be unencoded -- they will contain Unicode elements where appropriate. Public method: parse() -- accept data, and push it into the expat parser. """ def __init__(self, owner, encoding='UTF-8'): self.owner = owner xmldata.NodeGenerator.__init__(self, encoding=encoding) def handle_startelement(self, name, attrs): """handle_startelement() -- expat parser callback. Do not call. This watches for the initial top-level of the document, and jumps to the 'body' state. """ xmldata.NodeGenerator.handle_startelement(self, name, attrs) if (self.depth == 1): name = self.curnode.getname() namespace = self.curnode.getnamespace() self.owner.log.debug('got beginning of doc <%s xmlns=\'%s\'>', name, namespace) if (self.owner.state == 'start'): attrs = self.curnode.getattrs().copy() self.owner.docname = name self.owner.docattrs = attrs self.owner.jump('body', name, namespace, attrs) def handle_endelement(self, name): """handle_endelement() -- expat parser callback. Do not call. This watches for completion of a stanza (child of the top-level node), and triggers an event. The stanza is removed from the XML tree before dispatching, so that the tree doesn't grow forever. This also watches for the completion of the top-level document, at which time it shuts down. """ xmldata.NodeGenerator.handle_endelement(self, name) if (self.depth == 1): nod = self.curnode.getchild() nod.remove(True) if (self.owner.log.isEnabledFor(logging.DEBUG)): self.owner.log.debug('received:\n%s', nod.serialize(True)) self.owner.perform('stanza', nod) if (self.depth == 0): self.owner.log.debug('got end of doc') self.owner.stop() def handle_data(self, data): """handle_data() -- expat parser callback. Do not call. This ignores character data in between stanzas. Some Jabber servers send whitespace between stanzas to exercise the connection, and we don't want that accumulating in the XML tree. """ if (self.depth <= 1): return xmldata.NodeGenerator.handle_data(self, data) def parse(self, data): """parse(data) -> None Accept data, and push it into the expat parser. The *data* may be a str in some encoding, or a unicode, depending on the original *encoding* specified at construction time. """ self.parser.Parse(data)