| 1 | """
|
|---|
| 2 | SAX driver for the pyexpat C module. This driver works with
|
|---|
| 3 | pyexpat.__version__ == '2.22'.
|
|---|
| 4 | """
|
|---|
| 5 |
|
|---|
| 6 | version = "0.20"
|
|---|
| 7 |
|
|---|
| 8 | from xml.sax._exceptions import *
|
|---|
| 9 | from xml.sax.handler import feature_validation, feature_namespaces
|
|---|
| 10 | from xml.sax.handler import feature_namespace_prefixes
|
|---|
| 11 | from xml.sax.handler import feature_external_ges, feature_external_pes
|
|---|
| 12 | from xml.sax.handler import feature_string_interning
|
|---|
| 13 | from xml.sax.handler import property_xml_string, property_interning_dict
|
|---|
| 14 |
|
|---|
# Jython's xml.parsers.expat import does not fail with ImportError, so the
# platform name is inspected explicitly and the driver refuses to load there.
import sys
if sys.platform.startswith("java"):
    raise SAXReaderNotAvailable("expat not available in Java", None)
del sys

# Make sure a usable expat binding is present before defining the driver.
try:
    from xml.parsers import expat
except ImportError:
    raise SAXReaderNotAvailable("expat not supported", None)
if not hasattr(expat, "ParserCreate"):
    raise SAXReaderNotAvailable("expat not supported", None)
from xml.sax import xmlreader, saxutils, handler

# Short module-level aliases used by the element event handlers.
AttributesNSImpl = xmlreader.AttributesNSImpl
AttributesImpl = xmlreader.AttributesImpl

# When weak references are available, the locator holds the parser through
# a weak proxy so that parser and content handler do not form a reference
# cycle.  Without them, the "proxy" is simply the object itself.
try:
    import _weakref
except ImportError:
    def _mkproxy(obj):
        return obj
else:
    from weakref import proxy as _mkproxy
    del _weakref
| 45 |
|
|---|
| 46 | # --- ExpatLocator
|
|---|
| 47 |
|
|---|
class ExpatLocator(xmlreader.Locator):
    """Locator handed to content handlers by ExpatParser.

    The owning parser is stored through ``_mkproxy`` (a weak proxy when
    weak references are available) so that the locator does not create a
    reference cycle between the parser and the content handler.
    """

    def __init__(self, parser):
        self._ref = _mkproxy(parser)

    def getColumnNumber(self):
        """Return expat's current column, or None when no parser is active."""
        owner = self._ref
        if owner._parser is not None:
            return owner._parser.ErrorColumnNumber
        return None

    def getLineNumber(self):
        """Return expat's current line, or 1 when no parser is active."""
        owner = self._ref
        if owner._parser is not None:
            return owner._parser.ErrorLineNumber
        return 1

    def getPublicId(self):
        """Return the public identifier of the owner's current source."""
        owner = self._ref
        if owner is not None:
            return owner._source.getPublicId()
        return None

    def getSystemId(self):
        """Return the system identifier of the owner's current source."""
        owner = self._ref
        if owner is not None:
            return owner._source.getSystemId()
        return None
| 80 |
|
|---|
| 81 |
|
|---|
| 82 | # --- ExpatParser
|
|---|
| 83 |
|
|---|
class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
    """SAX driver for the pyexpat C module.

    The reader wraps an expat parser object (created in reset()) and
    forwards expat callbacks to the registered SAX handlers.  It also acts
    as its own Locator, reporting the position of the active expat parser.
    """

    def __init__(self, namespaceHandling=0, bufsize=2**16-20):
        """Create a reader.

        namespaceHandling -- initial value of the namespaces feature.
        bufsize -- passed on to IncrementalParser.__init__.
        """
        xmlreader.IncrementalParser.__init__(self, bufsize)
        self._source = xmlreader.InputSource()
        self._parser = None                 # expat parser; None when idle
        self._namespaces = namespaceHandling
        self._lex_handler_prop = None       # lexical handler property
        self._parsing = 0                   # true between first feed and close
        self._entity_stack = []             # (parser, source) of outer entities
        self._external_ges = 1
        self._interning = None              # dict when interning is enabled

    # XMLReader methods

    def parse(self, source):
        "Parse an XML document from a URL or an InputSource."
        source = saxutils.prepare_input_source(source)

        self._source = source
        self.reset()
        self._cont_handler.setDocumentLocator(ExpatLocator(self))
        xmlreader.IncrementalParser.parse(self, source)

    def prepareParser(self, source):
        """Give the expat parser the source's system id as base, if any."""
        if source.getSystemId() is not None:
            self._parser.SetBase(source.getSystemId())

    # Redefined setContentHandler to allow changing handlers during parsing

    def setContentHandler(self, handler):
        """Install a content handler, rebinding expat callbacks if parsing."""
        xmlreader.IncrementalParser.setContentHandler(self, handler)
        if self._parsing:
            self._reset_cont_handler()

    def getFeature(self, name):
        """Return the state of feature `name`.

        Raises SAXNotRecognizedException for unknown feature names.
        """
        if name == feature_namespaces:
            return self._namespaces
        elif name == feature_string_interning:
            return self._interning is not None
        elif name in (feature_validation, feature_external_pes,
                      feature_namespace_prefixes):
            return 0
        elif name == feature_external_ges:
            return self._external_ges
        raise SAXNotRecognizedException("Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set feature `name` to `state`.

        Raises SAXNotSupportedException while a parse is in progress or
        when asked to enable a feature this driver cannot provide, and
        SAXNotRecognizedException for unknown feature names.
        """
        if self._parsing:
            raise SAXNotSupportedException("Cannot set features while parsing")

        if name == feature_namespaces:
            self._namespaces = state
        elif name == feature_external_ges:
            self._external_ges = state
        elif name == feature_string_interning:
            if state:
                # Keep an existing interning dictionary if there is one.
                if self._interning is None:
                    self._interning = {}
            else:
                self._interning = None
        elif name == feature_validation:
            if state:
                raise SAXNotSupportedException(
                    "expat does not support validation")
        elif name == feature_external_pes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not read external parameter entities")
        elif name == feature_namespace_prefixes:
            if state:
                raise SAXNotSupportedException(
                    "expat does not report namespace prefixes")
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Return the value of property `name`.

        The XML string property is only available while an expat parser
        exists and only if that parser offers GetInputContext.
        """
        if name == handler.property_lexical_handler:
            return self._lex_handler_prop
        elif name == property_interning_dict:
            return self._interning
        elif name == property_xml_string:
            if self._parser:
                if hasattr(self._parser, "GetInputContext"):
                    return self._parser.GetInputContext()
                else:
                    raise SAXNotRecognizedException(
                        "This version of expat does not support getting"
                        " the XML string")
            else:
                raise SAXNotSupportedException(
                    "XML string cannot be returned when not parsing")
        raise SAXNotRecognizedException("Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set property `name` to `value`.

        The lexical handler may be replaced during parsing; the XML string
        property is read-only.
        """
        if name == handler.property_lexical_handler:
            self._lex_handler_prop = value
            if self._parsing:
                self._reset_lex_handler_prop()
        elif name == property_interning_dict:
            self._interning = value
        elif name == property_xml_string:
            raise SAXNotSupportedException("Property '%s' cannot be set" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    # IncrementalParser methods

    def feed(self, data, isFinal = 0):
        """Pass a chunk of document text to expat.

        The first chunk of a document resets the reader and emits
        startDocument().  An expat.error is wrapped in a SAXParseException
        and given to the error handler's fatalError().
        """
        if not self._parsing:
            self.reset()
            self._parsing = 1
            self._cont_handler.startDocument()

        try:
            # The isFinal parameter is internal to the expat reader.
            # If it is set to true, expat treats the input as complete.
            # When feeding chunks, they are not normally final -
            # except when invoked from close.
            self._parser.Parse(data, isFinal)
        except expat.error as e:
            exc = SAXParseException(expat.ErrorString(e.code), e, self)
            # FIXME: when to invoke error()?
            self._err_handler.fatalError(exc)

    def close(self):
        """Finish the current document and emit endDocument().

        Does nothing while an external entity is being processed, or when
        there is no active expat parser (never started, or already closed),
        which makes repeated calls harmless.
        """
        if self._entity_stack or self._parser is None:
            # Completing an external entity, or nothing left to close.
            return
        try:
            self.feed("", isFinal = 1)
            self._cont_handler.endDocument()
        finally:
            # Always leave the "parsing" state, even if the final chunk
            # failed, and break the cycle created by expat handlers
            # pointing to our methods.
            self._parsing = 0
            self._parser = None

    def _reset_cont_handler(self):
        """Bind expat's PI and character callbacks to the content handler."""
        self._parser.ProcessingInstructionHandler = \
                                    self._cont_handler.processingInstruction
        self._parser.CharacterDataHandler = self._cont_handler.characters

    def _reset_lex_handler_prop(self):
        """Bind (or unbind) expat's lexical callbacks to the lexical handler."""
        lex = self._lex_handler_prop
        parser = self._parser
        if lex is None:
            parser.CommentHandler = None
            parser.StartCdataSectionHandler = None
            parser.EndCdataSectionHandler = None
            parser.StartDoctypeDeclHandler = None
            parser.EndDoctypeDeclHandler = None
        else:
            parser.CommentHandler = lex.comment
            parser.StartCdataSectionHandler = lex.startCDATA
            parser.EndCdataSectionHandler = lex.endCDATA
            # startDTD takes its arguments in a different order than expat
            # supplies them, so it goes through an adapter method.
            parser.StartDoctypeDeclHandler = self.start_doctype_decl
            parser.EndDoctypeDeclHandler = lex.endDTD

    def reset(self):
        """Create a fresh expat parser and wire up all callbacks."""
        if self._namespaces:
            # " " separates namespace URI, local name and prefix in the
            # names expat reports; see start_element_ns.
            self._parser = expat.ParserCreate(self._source.getEncoding(), " ",
                                              intern=self._interning)
            self._parser.namespace_prefixes = 1
            self._parser.StartElementHandler = self.start_element_ns
            self._parser.EndElementHandler = self.end_element_ns
        else:
            self._parser = expat.ParserCreate(self._source.getEncoding(),
                                              intern = self._interning)
            self._parser.StartElementHandler = self.start_element
            self._parser.EndElementHandler = self.end_element

        self._reset_cont_handler()
        self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl
        self._parser.NotationDeclHandler = self.notation_decl
        self._parser.StartNamespaceDeclHandler = self.start_namespace_decl
        self._parser.EndNamespaceDeclHandler = self.end_namespace_decl

        self._decl_handler_prop = None
        if self._lex_handler_prop:
            self._reset_lex_handler_prop()
#         self._parser.DefaultHandler =
#         self._parser.DefaultHandlerExpand =
#         self._parser.NotStandaloneHandler =
        self._parser.ExternalEntityRefHandler = self.external_entity_ref
        try:
            self._parser.SkippedEntityHandler = self.skipped_entity_handler
        except AttributeError:
            # This pyexpat does not support SkippedEntity
            pass
        self._parser.SetParamEntityParsing(
            expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)

        self._parsing = 0
        self._entity_stack = []

    # Locator methods

    def getColumnNumber(self):
        """Return expat's current column, or None when no parser exists."""
        if self._parser is None:
            return None
        return self._parser.ErrorColumnNumber

    def getLineNumber(self):
        """Return expat's current line, or 1 when no parser exists."""
        if self._parser is None:
            return 1
        return self._parser.ErrorLineNumber

    def getPublicId(self):
        """Return the public identifier of the current input source."""
        return self._source.getPublicId()

    def getSystemId(self):
        """Return the system identifier of the current input source."""
        return self._source.getSystemId()

    # event handlers
    def start_element(self, name, attrs):
        self._cont_handler.startElement(name, AttributesImpl(attrs))

    def end_element(self, name):
        self._cont_handler.endElement(name)

    def start_element_ns(self, name, attrs):
        """Translate an expat namespace-mode start tag into startElementNS.

        Names arrive as "local", "uri local" or "uri local prefix",
        separated by whitespace.
        """
        pair = name.split()
        if len(pair) == 1:
            # no namespace
            pair = (None, name)
        elif len(pair) == 3:
            pair = pair[0], pair[1]
        else:
            # default namespace
            pair = tuple(pair)

        newattrs = {}
        qnames = {}
        for (aname, value) in attrs.items():
            parts = aname.split()
            length = len(parts)
            if length == 1:
                # no namespace
                qname = aname
                apair = (None, aname)
            elif length == 3:
                qname = "%s:%s" % (parts[2], parts[1])
                apair = parts[0], parts[1]
            else:
                # default namespace
                qname = parts[1]
                apair = tuple(parts)

            newattrs[apair] = value
            qnames[apair] = qname

        self._cont_handler.startElementNS(pair, None,
                                          AttributesNSImpl(newattrs, qnames))

    def end_element_ns(self, name):
        """Translate an expat namespace-mode end tag into endElementNS."""
        pair = name.split()
        if len(pair) == 1:
            pair = (None, name)
        elif len(pair) == 3:
            pair = pair[0], pair[1]
        else:
            pair = tuple(pair)

        self._cont_handler.endElementNS(pair, None)

    # this is not used (call directly to ContentHandler)
    def processing_instruction(self, target, data):
        self._cont_handler.processingInstruction(target, data)

    # this is not used (call directly to ContentHandler)
    def character_data(self, data):
        self._cont_handler.characters(data)

    def start_namespace_decl(self, prefix, uri):
        self._cont_handler.startPrefixMapping(prefix, uri)

    def end_namespace_decl(self, prefix):
        self._cont_handler.endPrefixMapping(prefix)

    def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):
        # Reorders sysid/pubid into the (name, pubid, sysid) order of startDTD.
        self._lex_handler_prop.startDTD(name, pubid, sysid)

    def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):
        self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)

    def notation_decl(self, name, base, sysid, pubid):
        self._dtd_handler.notationDecl(name, pubid, sysid)

    def external_entity_ref(self, context, base, sysid, pubid):
        """Parse an external general entity with a nested expat parser.

        Returns 1 when the entity was processed (or external general
        entities are disabled) and 0 when processing it failed.  The outer
        parser and source are restored in either case.
        """
        if not self._external_ges:
            return 1

        source = self._ent_handler.resolveEntity(pubid, sysid)
        source = saxutils.prepare_input_source(source,
                                               self._source.getSystemId() or
                                               "")

        self._entity_stack.append((self._parser, self._source))
        self._parser = self._parser.ExternalEntityParserCreate(context)
        self._source = source

        try:
            xmlreader.IncrementalParser.parse(self, source)
        except Exception:
            return 0  # FIXME: save error info here?
        finally:
            # Restore the enclosing entity's state even on failure, so the
            # reader is not left pointing at the abandoned sub-parser with
            # a stale entry on the entity stack.
            (self._parser, self._source) = self._entity_stack.pop()
        return 1

    def skipped_entity_handler(self, name, is_pe):
        if is_pe:
            # The SAX spec requires to report skipped PEs with a '%'
            name = '%'+name
        self._cont_handler.skippedEntity(name)
| 401 |
|
|---|
| 402 | # ---
|
|---|
| 403 |
|
|---|
def create_parser(*args, **kwargs):
    """Build an ExpatParser, forwarding every argument to its constructor."""
    reader = ExpatParser(*args, **kwargs)
    return reader
| 406 |
|
|---|
| 407 | # ---
|
|---|
| 408 |
|
|---|
if __name__ == "__main__":
    # Manual smoke test: parse a sample document and echo it back as XML.
    # XMLGenerator is defined in xml.sax.saxutils and is not re-exported by
    # the xml.sax package, so it has to be reached through saxutils.
    import xml.sax.saxutils
    p = create_parser()
    p.setContentHandler(xml.sax.saxutils.XMLGenerator())
    p.setErrorHandler(xml.sax.ErrorHandler())
    p.parse("../../../hamlet.xml")