| 1 | """An XML Reader is the SAX 2 name for an XML parser. XML Parsers
|
|---|
| 2 | should be based on this code. """
|
|---|
| 3 |
|
|---|
| 4 | import handler
|
|---|
| 5 |
|
|---|
| 6 | from _exceptions import SAXNotSupportedException, SAXNotRecognizedException
|
|---|
| 7 |
|
|---|
| 8 |
|
|---|
| 9 | # ===== XMLREADER =====
|
|---|
| 10 |
|
|---|
class XMLReader:
    """Callback-driven interface for reading an XML document.

    A SAX2 driver for an XML parser implements this interface.  Through
    it an application can query and change the parser's features and
    properties, install the handlers that receive document events, and
    start a parse.

    SAX is synchronous throughout: a parse method returns only once
    parsing has finished, and a reader does not report the next event
    until the current event-handler callback has returned."""

    def __init__(self):
        # Each handler slot starts out holding a default instance taken
        # from the handler module.
        self._cont_handler = handler.ContentHandler()
        self._dtd_handler = handler.DTDHandler()
        self._ent_handler = handler.EntityResolver()
        self._err_handler = handler.ErrorHandler()

    def parse(self, source):
        """Parse an XML document from a system identifier or InputSource.

        This base implementation always raises NotImplementedError."""
        raise NotImplementedError("This method must be implemented!")

    def getContentHandler(self):
        """Return the ContentHandler currently registered."""
        return self._cont_handler

    def setContentHandler(self, handler):
        """Register the object that will receive document content events."""
        self._cont_handler = handler

    def getDTDHandler(self):
        """Return the DTD handler currently registered."""
        return self._dtd_handler

    def setDTDHandler(self, handler):
        """Register the object that will receive basic DTD-related events."""
        self._dtd_handler = handler

    def getEntityResolver(self):
        """Return the EntityResolver currently registered."""
        return self._ent_handler

    def setEntityResolver(self, resolver):
        """Register the object used to resolve external entities."""
        self._ent_handler = resolver

    def getErrorHandler(self):
        """Return the ErrorHandler currently registered."""
        return self._err_handler

    def setErrorHandler(self, handler):
        """Register the object that will receive error-message events."""
        self._err_handler = handler

    def setLocale(self, locale):
        """Let an application choose the locale for errors and warnings.

        A SAX parser need not localize its errors and warnings, but it
        must throw a SAX exception when it cannot support the requested
        locale.  A locale change may be requested in the middle of a
        parse.

        This base implementation always raises
        SAXNotSupportedException."""
        message = "Locale support not implemented"
        raise SAXNotSupportedException(message)

    def getFeature(self, name):
        """Look up and return the state of a SAX2 feature.

        This base implementation recognizes no feature and always
        raises SAXNotRecognizedException."""
        message = "Feature '%s' not recognized" % name
        raise SAXNotRecognizedException(message)

    def setFeature(self, name, state):
        """Set the state of a SAX2 feature.

        This base implementation recognizes no feature and always
        raises SAXNotRecognizedException."""
        message = "Feature '%s' not recognized" % name
        raise SAXNotRecognizedException(message)

    def getProperty(self, name):
        """Look up and return the value of a SAX2 property.

        This base implementation recognizes no property and always
        raises SAXNotRecognizedException."""
        message = "Property '%s' not recognized" % name
        raise SAXNotRecognizedException(message)

    def setProperty(self, name, value):
        """Set the value of a SAX2 property.

        This base implementation recognizes no property and always
        raises SAXNotRecognizedException."""
        message = "Property '%s' not recognized" % name
        raise SAXNotRecognizedException(message)
| 90 |
|
|---|
class IncrementalParser(XMLReader):
    """This interface adds three extra methods (feed, close and reset)
    to the XMLReader interface that allow XML parsers to support
    incremental parsing.  Support for this interface is optional, since
    not all underlying XML parsers support this functionality.

    When the parser is instantiated it is ready to begin accepting
    data from the feed method immediately.  After parsing has been
    finished with a call to close the reset method must be called to
    make the parser ready to accept new data, either from feed or
    using the parse method.

    Note that these methods must _not_ be called during parsing, that
    is, after parse has been called and before it returns.

    By default, the class also implements the parse method of the
    XMLReader interface using the prepareParser, feed and close
    methods, as a convenience to SAX 2.0 driver writers."""

    def __init__(self, bufsize=2**16):
        """Create the parser.

        bufsize is the size passed to each read call that parse makes
        on the input stream."""
        self._bufsize = bufsize
        XMLReader.__init__(self)

    def parse(self, source):
        """Parse source by feeding it to the parser chunk by chunk.

        source is first passed through saxutils.prepare_input_source
        and then to prepareParser.  The byte stream of the prepared
        source is read with the configured bufsize, every chunk is
        handed to feed, and close is called once a read returns an
        empty result."""
        import saxutils
        source = saxutils.prepare_input_source(source)

        self.prepareParser(source)
        stream = source.getByteStream()
        chunk = stream.read(self._bufsize)
        # Test truthiness instead of comparing against "": a stream may
        # signal end of data with an empty value that is not equal to ""
        # (for instance an empty bytes object or None), and comparing
        # with != "" would then keep this loop running forever.
        while chunk:
            self.feed(chunk)
            chunk = stream.read(self._bufsize)
        self.close()

    def feed(self, data):
        """This method gives the raw XML data in the data parameter to
        the parser and makes it parse the data, emitting the
        corresponding events.  It is allowed for XML constructs to be
        split across several calls to feed.

        feed may raise SAXException.  This base implementation always
        raises NotImplementedError."""
        raise NotImplementedError("This method must be implemented!")

    def prepareParser(self, source):
        """This method is called by the parse implementation to allow
        the SAX 2.0 driver to prepare itself for parsing.

        This base implementation always raises NotImplementedError."""
        raise NotImplementedError("prepareParser must be overridden!")

    def close(self):
        """This method is called when the entire XML document has been
        passed to the parser through the feed method, to notify the
        parser that there are no more data.  This allows the parser to
        do the final checks on the document and empty the internal
        data buffer.

        The parser will not be ready to parse another document until
        the reset method has been called.

        close may raise SAXException.  This base implementation always
        raises NotImplementedError."""
        raise NotImplementedError("This method must be implemented!")

    def reset(self):
        """This method is called after close has been called to reset
        the parser so that it is ready to parse new documents.  The
        results of calling parse or feed after close without calling
        reset are undefined.

        This base implementation always raises NotImplementedError."""
        raise NotImplementedError("This method must be implemented!")
| 160 |
|
|---|
| 161 | # ===== LOCATOR =====
|
|---|
| 162 |
|
|---|
class Locator:
    """Interface that ties a SAX event to a location in the document.

    The values a locator returns are valid only during calls to
    DocumentHandler methods; at any other time the results are
    unpredictable."""

    def getColumnNumber(self):
        """Return the column number where the current event ends.

        This default implementation returns -1."""
        column = -1
        return column

    def getLineNumber(self):
        """Return the line number where the current event ends.

        This default implementation returns -1."""
        line = -1
        return line

    def getPublicId(self):
        """Return the public identifier for the current event.

        This default implementation returns None."""
        public_id = None
        return public_id

    def getSystemId(self):
        """Return the system identifier for the current event.

        This default implementation returns None."""
        system_id = None
        return system_id
| 184 |
|
|---|
| 185 | # ===== INPUTSOURCE =====
|
|---|
| 186 |
|
|---|
class InputSource:
    """Bundle of the information an XMLReader needs to read an entity.

    An instance may carry a public identifier, a system identifier, a
    byte stream (possibly together with character encoding
    information) and/or a character stream for the entity.

    Applications create instances to pass to XMLReader.parse and to
    return from EntityResolver.resolveEntity.

    The InputSource belongs to the application: an XMLReader is not
    allowed to modify an InputSource passed to it by the application,
    although it may make copies and modify those."""

    def __init__(self, system_id = None):
        # Only the system identifier can be supplied up front; every
        # other field starts out unset.
        self.__system_id = system_id
        self.__public_id = self.__encoding = None
        self.__bytefile = self.__charfile = None

    def setPublicId(self, public_id):
        """Set the public identifier of this InputSource."""
        self.__public_id = public_id

    def getPublicId(self):
        """Return the public identifier of this InputSource."""
        return self.__public_id

    def setSystemId(self, system_id):
        """Set the system identifier of this InputSource."""
        self.__system_id = system_id

    def getSystemId(self):
        """Return the system identifier of this InputSource."""
        return self.__system_id

    def setEncoding(self, encoding):
        """Set the character encoding of this InputSource.

        The encoding must be a string that is acceptable in an XML
        encoding declaration (see section 4.3.3 of the XML
        recommendation).

        When the InputSource also contains a character stream, its
        encoding attribute is ignored."""
        self.__encoding = encoding

    def getEncoding(self):
        """Return the character encoding of this InputSource."""
        return self.__encoding

    def setByteStream(self, bytefile):
        """Set the byte stream for this input source: a Python
        file-like object that does not perform byte-to-character
        conversion.

        A SAX parser ignores the byte stream when a character stream
        is also specified, but uses it in preference to opening a URI
        connection itself.

        An application that knows the character encoding of the byte
        stream should set it with the setEncoding method."""
        self.__bytefile = bytefile

    def getByteStream(self):
        """Return the byte stream for this input source.

        The getEncoding method returns the character encoding for
        this byte stream, or None if it is unknown."""
        return self.__bytefile

    def setCharacterStream(self, charfile):
        """Set the character stream for this input source.  (The
        stream must be a Python 2.0 Unicode-wrapped file-like that
        performs conversion to Unicode strings.)

        When a character stream is specified, the SAX parser ignores
        any byte stream and does not attempt to open a URI connection
        to the system identifier."""
        self.__charfile = charfile

    def getCharacterStream(self):
        """Return the character stream for this input source."""
        return self.__charfile
| 273 |
|
|---|
| 274 | # ===== ATTRIBUTESIMPL =====
|
|---|
| 275 |
|
|---|
class AttributesImpl:
    """Attribute collection backed by a mapping, without namespace
    awareness.

    Attribute names serve as their own qualified names here, so the
    name-based and QName-based lookups give the same answers."""

    def __init__(self, attrs):
        """Non-NS-aware implementation.

        attrs should be of the form {name : value}."""
        self._attrs = attrs

    def getLength(self):
        """Return the number of attributes."""
        return len(self._attrs)

    def getType(self, name):
        """Return the attribute type; this is "CDATA" for every name."""
        return "CDATA"

    def getValue(self, name):
        """Return the value of attribute name; KeyError if it is absent."""
        return self._attrs[name]

    def getValueByQName(self, name):
        """Return the value for the qualified name; KeyError if absent."""
        return self._attrs[name]

    def getNameByQName(self, name):
        """Return the name for the qualified name, which is name itself.

        Raises KeyError if there is no such attribute."""
        # "in" and KeyError(name) replace dict.has_key and the
        # "raise KeyError, name" statement, which exist only in Python 2.
        if name not in self._attrs:
            raise KeyError(name)
        return name

    def getQNameByName(self, name):
        """Return the qualified name for name, which is name itself.

        Raises KeyError if there is no such attribute."""
        if name not in self._attrs:
            raise KeyError(name)
        return name

    def getNames(self):
        """Return a list of the attribute names."""
        return list(self._attrs.keys())

    def getQNames(self):
        """Return a list of the qualified names (same as the names)."""
        return list(self._attrs.keys())

    def __len__(self):
        return len(self._attrs)

    def __getitem__(self, name):
        return self._attrs[name]

    def keys(self):
        """Return a list of the attribute names."""
        return list(self._attrs.keys())

    def has_key(self, name):
        """Return whether an attribute called name exists."""
        return name in self._attrs

    def __contains__(self, name):
        return name in self._attrs

    def get(self, name, alternative=None):
        """Return the value of name, or alternative if it is absent."""
        return self._attrs.get(name, alternative)

    def copy(self):
        """Return a new instance of the same class.

        The new instance wraps the same underlying mapping object; the
        mapping itself is not duplicated."""
        return self.__class__(self._attrs)

    def items(self):
        """Return a list of (name, value) pairs."""
        return list(self._attrs.items())

    def values(self):
        """Return a list of the attribute values."""
        return list(self._attrs.values())
| 338 |
|
|---|
| 339 | # ===== ATTRIBUTESNSIMPL =====
|
|---|
| 340 |
|
|---|
class AttributesNSImpl(AttributesImpl):
    """Namespace-aware attribute collection.

    Attributes are keyed by (ns_uri, lname) pairs, and a second
    mapping records the qualified name that belongs to each pair."""

    def __init__(self, attrs, qnames):
        """NS-aware implementation.

        attrs should be of the form {(ns_uri, lname): value, ...}.
        qnames of the form {(ns_uri, lname): qname, ...}."""
        self._attrs = attrs
        self._qnames = qnames

    def getValueByQName(self, name):
        """Return the value of the attribute whose qualified name is name.

        Raises KeyError if no attribute has that qualified name."""
        # The qnames mapping goes from (ns_uri, lname) to qname, so the
        # reverse lookup has to scan its items.
        for (nsname, qname) in self._qnames.items():
            if qname == name:
                return self._attrs[nsname]

        # KeyError(name) replaces the Python-2-only "raise KeyError, name".
        raise KeyError(name)

    def getNameByQName(self, name):
        """Return the (ns_uri, lname) pair whose qualified name is name.

        Raises KeyError if no attribute has that qualified name."""
        for (nsname, qname) in self._qnames.items():
            if qname == name:
                return nsname

        raise KeyError(name)

    def getQNameByName(self, name):
        """Return the qualified name for the (ns_uri, lname) pair name.

        Raises KeyError if the pair is unknown."""
        return self._qnames[name]

    def getQNames(self):
        """Return a list of the qualified names."""
        return list(self._qnames.values())

    def copy(self):
        """Return a new instance of the same class.

        The new instance wraps the same two underlying mapping objects;
        the mappings themselves are not duplicated."""
        return self.__class__(self._attrs, self._qnames)
| 373 |
|
|---|
| 374 |
|
|---|
def _test():
    """Instantiate XMLReader, IncrementalParser and Locator in turn,
    each with no arguments; the instances are discarded."""
    for factory in (XMLReader, IncrementalParser, Locator):
        factory()
| 379 |
|
|---|
# When this module is executed as a script, call _test().
if __name__ == "__main__":
    _test()