source: trunk/essentials/dev-lang/python/Doc/tools/sgmlconv/docfixer.py

Last change on this file was 3225, checked in by bird, 19 years ago

Python 2.5

File size: 36.4 KB
Line 
#! /usr/bin/env python

"""Perform massive transformations on a document tree created from the LaTeX
of the Python documentation, and dump the ESIS data for the transformed tree.
"""
6
7
8import errno
9import esistools
10import re
11import sys
12import xml.dom
13import xml.dom.minidom
14
# Short aliases for the DOM node-type codes that are compared against
# node.nodeType throughout this module.
ELEMENT = xml.dom.Node.ELEMENT_NODE
TEXT = xml.dom.Node.TEXT_NODE
ENTITY_REFERENCE = xml.dom.Node.ENTITY_REFERENCE_NODE
18
19
class ConversionError(Exception):
    """Raised when the tree has a shape the fixers cannot convert.

    fixup_table() and build_para() raise this with a descriptive message.
    """
22
23
# ewrite() writes a diagnostic string to standard error; bwrite() does the
# same but wraps the text in the terminal's bold on/off sequences when they
# can be obtained, and otherwise is simply ewrite().
ewrite = sys.stderr.write
try:
    # We can only do this trick on Unix (if tput is on $PATH)!
    # The check must use os.name: sys.platform holds values such as
    # "linux2" or "darwin" and is never "posix", so testing it would
    # unconditionally disable the bold writer.
    import os
    if os.name != "posix" or not sys.stderr.isatty():
        raise ImportError
    import commands
except ImportError:
    bwrite = ewrite
else:
    # The escape sequences are fetched once, when the function is defined,
    # by binding them as default argument values.
    def bwrite(s, BOLDON=commands.getoutput("tput bold"),
               BOLDOFF=commands.getoutput("tput sgr0")):
        """Write *s* to standard error surrounded by bold on/off codes."""
        ewrite("%s%s%s" % (BOLDON, s, BOLDOFF))
36
37
# Tag name given to the paragraph elements that build_para() creates.
PARA_ELEMENT = "para"

# Set to a true value to make para_msg() emit its messages and to enable the
# depth-limited bail-out in fixup_paras_helper().
DEBUG_PARA_FIXER = 0

if not DEBUG_PARA_FIXER:
    def para_msg(s):
        """Discard *s*; paragraph-fixer debugging is switched off."""
else:
    def para_msg(s):
        """Write the debugging message *s* to standard error."""
        ewrite("*** %s\n" % s)
48
49
def get_first_element(doc, gi):
    """Return the first direct child of *doc* named *gi*, or None.

    Only the immediate children are examined; the comparison is made on
    each child's nodeName.
    """
    for candidate in doc.childNodes:
        if candidate.nodeName == gi:
            return candidate
    return None
54
def extract_first_element(doc, gi):
    """Detach and return the first direct child of *doc* named *gi*.

    Returns None, leaving *doc* untouched, when there is no such child.
    """
    found = get_first_element(doc, gi)
    if found is None:
        return None
    doc.removeChild(found)
    return found
60
61
def get_documentElement(node):
    """Return the last element node among the children of *node*.

    The scan does not stop at the first match, so when several element
    children exist the final one wins.  Returns None if there are none.
    """
    last_element = None
    for candidate in node.childNodes:
        if candidate.nodeType == ELEMENT:
            last_element = candidate
    return last_element
68
69
def set_tagName(elem, gi):
    """Rename *elem* to *gi* by overwriting both of its name attributes."""
    elem.tagName = gi
    elem.nodeName = gi
72
73
def find_all_elements(doc, gi):
    """Return a list of the nodes named *gi* at or below *doc*.

    *doc* itself is included when its nodeName matches.  For every element
    child, the child (if it matches) is followed by its matching
    descendants as reported by getElementsByTagName().
    """
    found = []
    if doc.nodeName == gi:
        found.append(doc)
    for child in doc.childNodes:
        if child.nodeType != ELEMENT:
            continue
        if child.tagName == gi:
            found.append(child)
        found.extend(child.getElementsByTagName(gi))
    return found
85
def find_all_child_elements(doc, gi):
    """Return a list of the direct children of *doc* whose nodeName is *gi*."""
    return [child for child in doc.childNodes if child.nodeName == gi]
92
93
def find_all_elements_from_set(doc, gi_set):
    """Return a list of the nodes at or below *doc* named by *gi_set*.

    *gi_set* is any container supporting ``in``; the actual walk is done by
    __find_all_elements_from_set() with a fresh result list.
    """
    collected = []
    return __find_all_elements_from_set(doc, gi_set, collected)
96
def __find_all_elements_from_set(doc, gi_set, nodes):
    """Append to *nodes*, in pre-order, every node whose name is in *gi_set*.

    *doc* is tested first, then each of its element children is visited
    recursively.  The same *nodes* list is returned.
    """
    if doc.nodeName in gi_set:
        nodes.append(doc)
    element_children = [kid for kid in doc.childNodes
                        if kid.nodeType == ELEMENT]
    for kid in element_children:
        __find_all_elements_from_set(kid, gi_set, nodes)
    return nodes
104
105
def simplify(doc, fragment):
    """Reshape *fragment* so that it has a single, properly named root.

    Try to rationalize the document a bit, since these things are simply
    not valid SGML/XML documents as they stand, and need a little work:

    * the <documentclass> element is removed and its "classname" attribute
      becomes the new tag name of the <document> element;
    * everything preceding <document> in the fragment is moved inside it;
    * the top-level <title> and <input> elements are pulled out and
      re-inserted at the start of the document element;
    * leading text nodes of the fragment are dropped.
    """
    documentclass = "document"
    inputs = []
    classnode = extract_first_element(fragment, "documentclass")
    if classnode is not None:
        documentclass = classnode.getAttribute("classname")
    title = extract_first_element(fragment, "title")
    if title is not None:
        inputs.append(title)
    # update the name of the root element
    root = get_first_element(fragment, "document")
    if root is not None:
        set_tagName(root, documentclass)
        # Move everything that comes before this node into this node;
        # this will be the document element.
        toplevel = fragment.childNodes
        point = root.firstChild
        while not toplevel[0].isSameNode(root):
            root.insertBefore(toplevel[0], point)
    # Collect every top-level <input>, in document order.
    included = extract_first_element(fragment, "input")
    while included is not None:
        inputs.append(included)
        included = extract_first_element(fragment, "input")
    if inputs:
        docelem = get_documentElement(fragment)
        # Each item is inserted at the front, so walk the list backwards to
        # end up with the original order.
        inputs.reverse()
        for item in inputs:
            text = doc.createTextNode("\n")
            docelem.insertBefore(text, docelem.firstChild)
            docelem.insertBefore(item, text)
        docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild)
    while fragment.firstChild and fragment.firstChild.nodeType == TEXT:
        fragment.removeChild(fragment.firstChild)
142
143
def cleanup_root_text(doc):
    """Remove the direct text children of *doc*, sparing post-comment text.

    A text node is kept only when the node immediately before it has the
    nodeName "COMMENT"; every other direct text child is removed.
    """
    discards = []
    follows_comment = False
    for n in doc.childNodes:
        if n.nodeType == TEXT and not follows_comment:
            discards.append(n)
            follows_comment = False
        else:
            follows_comment = n.nodeName == "COMMENT"
    # Removal is deferred so the child list is not modified while iterating.
    for n in discards:
        doc.removeChild(n)
156
157
# Tag names that find_and_fix_descriptors() hands to rewrite_descriptor().
DESCRIPTOR_ELEMENTS = (
    "cfuncdesc",
    "cvardesc",
    "ctypedesc",
    "classdesc",
    "memberdesc",
    "memberdescni",
    "methoddesc",
    "methoddescni",
    "excdesc",
    "funcdesc",
    "funcdescni",
    "opcodedesc",
    "datadesc",
    "datadescni",
    )
164
def fixup_descriptors(doc, fragment):
    """Rewrite the descriptor elements found in each <section> of *fragment*."""
    for section in find_all_elements(fragment, "section"):
        find_and_fix_descriptors(doc, section)
169
170
def find_and_fix_descriptors(doc, container):
    """Rewrite descriptor children of *container*, descending into subsections.

    Direct element children named in DESCRIPTOR_ELEMENTS are passed to
    rewrite_descriptor(); <subsection> children are searched recursively.
    Other children are ignored.
    """
    for child in container.childNodes:
        if child.nodeType != ELEMENT:
            continue
        gi = child.tagName
        if gi in DESCRIPTOR_ELEMENTS:
            rewrite_descriptor(doc, child)
        elif gi == "subsection":
            find_and_fix_descriptors(doc, child)
180
181
def rewrite_descriptor(doc, descriptor):
    """Restructure *descriptor* into <signature>s plus one <description>.

    Do these things:
      1. Add an "index='no'" attribute to the element if the tagName
         ends in 'ni', removing the 'ni' from the name.
      2. Create a <signature> from the name attribute
      2a.Create an <args> if it appears to be available.
      3. Create additional <signature>s from <*line{,ni}> elements,
         if found.
      4. If a <versionadded> is found, move it to an attribute on the
         descriptor.
      5. Move remaining child nodes to a <description> element.
      6. Put it back together.

    Raises RuntimeError if a "var" attribute appears on anything other than
    an <opcodedesc>.  Diagnostics are written to standard error so that they
    cannot end up in the ESIS stream written to standard output.
    """
    # 1.
    descname = descriptor.tagName
    # NOTE(review): this tests the "name" attribute, although the step-1
    # description talks about indexing; it may have been meant to read the
    # "index" attribute.  Behaviour is left as found -- confirm before
    # changing.
    index = descriptor.getAttribute("name") != "no"
    desctype = descname[:-4]  # remove 'desc'
    linename = desctype + "line"
    if not index:
        linename = linename + "ni"
    # 2.
    signature = doc.createElement("signature")
    name = doc.createElement("name")
    signature.appendChild(doc.createTextNode("\n "))
    signature.appendChild(name)
    name.appendChild(doc.createTextNode(descriptor.getAttribute("name")))
    descriptor.removeAttribute("name")
    # 2a.
    if descriptor.hasAttribute("var"):
        if descname != "opcodedesc":
            raise RuntimeError(
                "got 'var' attribute on descriptor other than opcodedesc")
        variable = descriptor.getAttribute("var")
        if variable:
            args = doc.createElement("args")
            args.appendChild(doc.createTextNode(variable))
            signature.appendChild(doc.createTextNode("\n "))
            signature.appendChild(args)
        descriptor.removeAttribute("var")
    newchildren = [signature]
    children = descriptor.childNodes
    pos = skip_leading_nodes(children)
    if pos < len(children):
        child = children[pos]
        if child.nodeName == "args":
            # move <args> to <signature>, or remove if empty:
            child.parentNode.removeChild(child)
            if len(child.childNodes):
                signature.appendChild(doc.createTextNode("\n "))
                signature.appendChild(child)
    signature.appendChild(doc.createTextNode("\n "))
    # 3, 4.
    pos = skip_leading_nodes(children, pos)
    while pos < len(children) \
          and children[pos].nodeName in (linename, "versionadded"):
        if children[pos].tagName == linename:
            # this is really a supplemental signature, create <signature>
            # Keep a pristine copy so it can be shown if conversion fails.
            oldchild = children[pos].cloneNode(1)
            try:
                sig = methodline_to_signature(doc, children[pos])
            except KeyError:
                ewrite(oldchild.toxml() + "\n")
                raise
            newchildren.append(sig)
        else:
            # <versionadded added=...>
            descriptor.setAttribute(
                "added", children[pos].getAttribute("version"))
        pos = skip_leading_nodes(children, pos + 1)
    # 5.
    description = doc.createElement("description")
    description.appendChild(doc.createTextNode("\n"))
    newchildren.append(description)
    move_children(descriptor, description, pos)
    # <description> always holds at least the "\n" text node added above.
    last = description.childNodes[-1]
    if last.nodeType == TEXT:
        last.data = last.data.rstrip() + "\n "
    # 6.
    # should have nothing but whitespace and signature lines in <descriptor>;
    # discard them
    while descriptor.childNodes:
        descriptor.removeChild(descriptor.childNodes[0])
    for node in newchildren:
        descriptor.appendChild(doc.createTextNode("\n "))
        descriptor.appendChild(node)
    descriptor.appendChild(doc.createTextNode("\n"))
269
270
def methodline_to_signature(doc, methodline):
    """Build and return a <signature> element from a <*line> element.

    The "name" attribute of *methodline* is removed and becomes the text of
    a <name> child; any child nodes of *methodline* are moved into an
    <args> child.  *methodline* itself is left in its parent.
    """
    name = doc.createElement("name")
    name.appendChild(doc.createTextNode(methodline.getAttribute("name")))
    methodline.removeAttribute("name")
    signature = doc.createElement("signature")
    signature.appendChild(doc.createTextNode("\n "))
    signature.appendChild(name)
    if methodline.childNodes:
        args = doc.createElement("args")
        signature.appendChild(doc.createTextNode("\n "))
        signature.appendChild(args)
        move_children(methodline, args)
    signature.appendChild(doc.createTextNode("\n "))
    return signature
285
286
def move_children(origin, dest, start=0):
    """Move the children of *origin* from index *start* onwards into *dest*.

    The nodes are appended to *dest* in their original order; children of
    *origin* before *start* stay where they are.
    """
    remaining = origin.childNodes
    # The list shrinks with every removal, so the node to move is always
    # the one currently at index *start*.
    while len(remaining) > start:
        moved = remaining[start]
        origin.removeChild(moved)
        dest.appendChild(moved)
293
294
def handle_appendix(doc, fragment):
    """Move everything after the <appendix> marker into a <back-matter>.

    Must be called after simplify() if the document is multi-rooted to
    begin with.  The first <appendix> element found below a child of the
    document element is removed; all following children of the document
    element are then re-parented into a new <back-matter> element appended
    to the document element.  Nothing is added when no nodes follow the
    marker.
    """
    docelem = get_documentElement(fragment)
    seen_appendix = False
    nodes = []
    for node in docelem.childNodes:
        if seen_appendix:
            nodes.append(node)
        elif node.nodeType == ELEMENT:
            appnodes = node.getElementsByTagName("appendix")
            if appnodes:
                seen_appendix = True
                marker = appnodes[0]
                parent = marker.parentNode
                parent.removeChild(marker)
                parent.normalize()
    if nodes:
        # Explicit loops rather than map(): the calls are made purely for
        # their side effects.
        for node in nodes:
            docelem.removeChild(node)
        docelem.appendChild(doc.createTextNode("\n\n\n"))
        back = doc.createElement("back-matter")
        docelem.appendChild(back)
        back.appendChild(doc.createTextNode("\n"))
        # Leading whitespace-only text is dropped rather than moved.
        while nodes and nodes[0].nodeType == TEXT \
              and not nodes[0].data.strip():
            del nodes[0]
        for node in nodes:
            back.appendChild(node)
        docelem.appendChild(doc.createTextNode("\n"))
322
323
def handle_labels(doc, fragment):
    """Turn each <label id="..."/> into an "id" attribute and remove it.

    The attribute is set on the label's parent, or on the grandparent when
    the parent is a <title>.  Labels with an empty "id" are left alone.
    After a label is removed from a <title>, trailing whitespace of the
    title's last text node (if there is one) is stripped.
    """
    for label in find_all_elements(fragment, "label"):
        label_id = label.getAttribute("id")
        if not label_id:
            continue
        parent = label.parentNode
        in_title = parent.tagName == "title"
        if in_title:
            parent.parentNode.setAttribute("id", label_id)
        else:
            parent.setAttribute("id", label_id)
        # now, remove <label id="..."/> from parent:
        parent.removeChild(label)
        if in_title:
            parent.normalize()
            # The title may be empty once the label is gone, so look at
            # lastChild instead of indexing the child list.
            last = parent.lastChild
            if last is not None and last.nodeType == TEXT:
                last.data = last.data.rstrip()
342
343
def fixup_trailing_whitespace(doc, fragment, wsmap):
    """Normalize the whitespace just inside and just after selected end tags.

    *wsmap* maps a tag name to a ``(before-end-tag, after-end-tag)`` pair of
    strings.  For every node below *fragment* whose name is a key of
    *wsmap*, trailing whitespace of its last text child is replaced by the
    first string, and the whitespace following the element is replaced by
    the second (a new text node is inserted if none follows).  Nodes are
    processed from the inside out.
    """
    # Breadth-first collection of the nodes to fix.
    queue = [fragment]
    fixups = []
    while queue:
        node = queue.pop(0)
        if node.nodeName in wsmap:
            fixups.append(node)
        for child in node.childNodes:
            if child.nodeType == ELEMENT:
                queue.append(child)

    # reverse the list to process from the inside out
    fixups.reverse()
    for node in fixups:
        node.parentNode.normalize()
        before, after = wsmap[node.tagName]
        lastchild = node.lastChild
        # An element without children has no lastChild to adjust.
        if lastchild is not None and lastchild.nodeType == TEXT:
            lastchild.data = lastchild.data.rstrip() + before
        nextnode = node.nextSibling
        if nextnode and nextnode.nodeType == TEXT:
            nextnode.data = after + nextnode.data.lstrip()
        else:
            wsnode = doc.createTextNode(after)
            node.parentNode.insertBefore(wsnode, nextnode)
        # hack to get the title in place:
        if node.tagName == "title" \
           and node.parentNode.firstChild.nodeType == ELEMENT:
            node.parentNode.insertBefore(doc.createTextNode("\n "),
                                         node.parentNode.firstChild)
            node.parentNode.normalize()
379
380
def normalize(doc):
    """Call normalize() on every element that is a direct child of *doc*."""
    elements = [node for node in doc.childNodes if node.nodeType == ELEMENT]
    for element in elements:
        element.normalize()
385
386
def cleanup_trailing_parens(doc, element_names):
    """Strip a trailing "()" from the text of the named elements.

    The tree below *doc* is walked breadth-first.  When an element's tag is
    in *element_names* and its last child is a text node ending in "()",
    those two characters are removed; such elements are not descended into.
    All other elements have their element children queued for inspection.
    """
    targets = set(element_names)
    queue = [node for node in doc.childNodes if node.nodeType == ELEMENT]
    while queue:
        node = queue.pop(0)
        if node.tagName in targets:
            lastchild = node.lastChild
            if lastchild and lastchild.nodeType == TEXT:
                data = lastchild.data
                if data.endswith("()"):
                    lastchild.data = data[:-2]
        else:
            queue.extend([child for child in node.childNodes
                          if child.nodeType == ELEMENT])
406
407
def contents_match(left, right):
    """Return 1 if the children of *left* and *right* match, else 0.

    Children are compared pairwise: node types must agree, elements must
    have the same tag name and recursively matching contents, and text
    nodes must carry identical data.  Attributes are not compared, and any
    other node type is treated as a mismatch.
    """
    left_children = left.childNodes
    right_children = right.childNodes
    if len(left_children) != len(right_children):
        return 0
    # The lengths are equal here, so zip() pairs every child.
    for l, r in zip(left_children, right_children):
        nodeType = l.nodeType
        if nodeType != r.nodeType:
            return 0
        if nodeType == ELEMENT:
            if l.tagName != r.tagName:
                return 0
            # should check attributes, but that's not a problem here
            if not contents_match(l, r):
                return 0
        elif nodeType == TEXT:
            if l.data != r.data:
                return 0
        else:
            # not quite right, but good enough
            return 0
    return 1
430
431
def create_module_info(doc, section):
    """Gather a section's module metadata into a <moduleinfo> element.

    Does nothing unless *section* has a <modulesynopsis> child.  Otherwise
    the <modulesynopsis>, <moduleauthor> and <platform> children are
    extracted (the first two renamed to <synopsis> and <author>), and, when
    *section* is a <section>, they are assembled together with data from
    <declaremodule>, <versionadded> and the <title> into a new <moduleinfo>
    element inserted into the section.
    """
    # Heavy.
    synopsis = extract_first_element(section, "modulesynopsis")
    if synopsis is None:
        return
    set_tagName(synopsis, "synopsis")
    lastchild = synopsis.childNodes[-1]
    if lastchild.nodeType == TEXT \
       and lastchild.data[-1:] == ".":
        lastchild.data = lastchild.data[:-1]
    modauthor = extract_first_element(section, "moduleauthor")
    if modauthor:
        # The author's name moves from an attribute to element content.
        set_tagName(modauthor, "author")
        modauthor.appendChild(doc.createTextNode(
            modauthor.getAttribute("name")))
        modauthor.removeAttribute("name")
    platform = extract_first_element(section, "platform")
    if section.tagName == "section":
        modinfo_pos = 2
        modinfo = doc.createElement("moduleinfo")
        moddecl = extract_first_element(section, "declaremodule")
        name = None
        if moddecl:
            modinfo.appendChild(doc.createTextNode("\n "))
            name = moddecl.attributes["name"].value
            namenode = doc.createElement("name")
            namenode.appendChild(doc.createTextNode(name))
            modinfo.appendChild(namenode)
            typeattr = moddecl.attributes.get("type")
            if typeattr:
                modtype = typeattr.value
                modinfo.appendChild(doc.createTextNode("\n "))
                typenode = doc.createElement("type")
                typenode.appendChild(doc.createTextNode(modtype))
                modinfo.appendChild(typenode)
        versionadded = extract_first_element(section, "versionadded")
        if versionadded:
            modinfo.setAttribute("added", versionadded.getAttribute("version"))
        title = get_first_element(section, "title")
        if title:
            children = title.childNodes
            if len(children) >= 2 \
               and children[0].nodeName == "module" \
               and children[0].childNodes[0].data == name:
                # this is it; morph the <title> into <short-synopsis>
                first_data = children[1]
                if first_data.data[:4] == " ---":
                    first_data.data = first_data.data[4:].lstrip()
                set_tagName(title, "short-synopsis")
                if children[-1].nodeType == TEXT \
                   and children[-1].data[-1:] == ".":
                    children[-1].data = children[-1].data[:-1]
                section.removeChild(title)
                section.removeChild(section.childNodes[0])
                title.removeChild(children[0])
                modinfo_pos = 0
            else:
                ewrite("module name in title doesn't match"
                       " <declaremodule/>; no <short-synopsis/>\n")
        else:
            ewrite("Unexpected condition: <section/> without <title/>\n")
        modinfo.appendChild(doc.createTextNode("\n "))
        modinfo.appendChild(synopsis)
        if title and not contents_match(title, synopsis):
            # The short synopsis is actually different,
            # and needs to be stored:
            modinfo.appendChild(doc.createTextNode("\n "))
            modinfo.appendChild(title)
        if modauthor:
            modinfo.appendChild(doc.createTextNode("\n "))
            modinfo.appendChild(modauthor)
        if platform:
            modinfo.appendChild(doc.createTextNode("\n "))
            modinfo.appendChild(platform)
        modinfo.appendChild(doc.createTextNode("\n "))
        section.insertBefore(modinfo, section.childNodes[modinfo_pos])
        section.insertBefore(doc.createTextNode("\n "), modinfo)
        #
        # The rest of this removes extra newlines from where we cut out
        # a lot of elements.  A lot of code for minimal value, but it
        # keeps the generated *ML from being too funny looking.
        #
        section.normalize()
        children = section.childNodes
        for i, child in enumerate(children):
            if child.nodeName != "moduleinfo":
                continue
            nextnode = children[i + 1]
            if nextnode.nodeType == TEXT:
                data = nextnode.data
                stripped = data.lstrip()
                # Only rewrite when more than four leading whitespace
                # characters were present.
                if len(stripped) < (len(data) - 4):
                    nextnode.data = "\n\n\n" + stripped
525
526
def cleanup_synopses(doc, fragment):
    """Run create_module_info() on every <section> found in *fragment*."""
    sections = find_all_elements(fragment, "section")
    for section in sections:
        create_module_info(doc, section)
530
531
def fixup_table_structures(doc, fragment):
    """Run fixup_table() on every <table> found in *fragment*."""
    tables = find_all_elements(fragment, "table")
    for table in tables:
        fixup_table(doc, table)
535
536
def fixup_table(doc, table):
    """Rebuild *table* as a single <tgroup> holding a <thead> and a <tbody>.

    The direct <entry> children become one header <row>; the direct <row>
    children become the body.  A row directly followed (ignoring
    non-element nodes) by an <hline> gets rowsep="1".  Whatever remains in
    the table must be whitespace-only text or <hline> elements, which are
    discarded; anything else raises ConversionError.
    """
    # create the table head
    thead = doc.createElement("thead")
    head_row = doc.createElement("row")
    move_elements_by_name(doc, table, head_row, "entry")
    thead.appendChild(doc.createTextNode("\n "))
    thead.appendChild(head_row)
    thead.appendChild(doc.createTextNode("\n "))
    # create the table body
    tbody = doc.createElement("tbody")
    children = table.childNodes
    prev_row = None
    for child in children:
        if child.nodeType != ELEMENT:
            continue
        gi = child.tagName
        if gi == "hline" and prev_row is not None:
            prev_row.setAttribute("rowsep", "1")
        elif gi == "row":
            prev_row = child
    # save the rows:
    tbody.appendChild(doc.createTextNode("\n "))
    move_elements_by_name(doc, table, tbody, "row", sep="\n ")
    # and toss the rest:
    while children:
        child = children[0]
        kind = child.nodeType
        if kind == TEXT:
            if child.data.strip():
                raise ConversionError("unexpected free data in <%s>: %r"
                                      % (table.tagName, child.data))
        elif kind == ELEMENT:
            if child.tagName != "hline":
                raise ConversionError(
                    "unexpected <%s> in table" % child.tagName)
        else:
            raise ConversionError(
                "unexpected %s node in table" % child.__class__.__name__)
        table.removeChild(child)
    # nothing left in the <table>; add the <thead> and <tbody>
    tgroup = doc.createElement("tgroup")
    for part in (thead, tbody):
        tgroup.appendChild(doc.createTextNode("\n "))
        tgroup.appendChild(part)
    tgroup.appendChild(doc.createTextNode("\n "))
    table.appendChild(tgroup)
    # now make the <entry>s look nice:
    for row in table.getElementsByTagName("row"):
        fixup_row(doc, row)
589
590
def fixup_row(doc, row):
    """Insert a whitespace text node before every child of *row* but the first.

    The children are snapshotted first because the insertions change the
    live child list.
    """
    for entry in list(row.childNodes[1:]):
        row.insertBefore(doc.createTextNode("\n "), entry)
597
598
def move_elements_by_name(doc, source, dest, name, sep=None):
    """Move the direct children of *source* named *name* to the end of *dest*.

    When *sep* is a non-empty string, a new text node containing it is
    appended to *dest* after each moved node.
    """
    matches = [child for child in source.childNodes
               if child.nodeName == name]
    for match in matches:
        source.removeChild(match)
        dest.appendChild(match)
        if sep:
            dest.appendChild(doc.createTextNode(sep))
609
610
# Elements whose children are themselves scanned for paragraph material
# (see fixup_paras_helper()) instead of being swallowed into a paragraph.
RECURSE_INTO_PARA_CONTAINERS = (
    "chapter",
    "abstract",
    "enumerate",
    "section",
    "subsection",
    "subsubsection",
    "paragraph",
    "subparagraph",
    "back-matter",
    "howto",
    "manual",
    "item",
    "itemize",
    "fulllineitems",
    "enumeration",
    "descriptionlist",
    "definitionlist",
    "definition",
    )
619
# Elements that sit at the same level as a paragraph: build_para() stops
# collecting when it meets one, and skip_leading_nodes() steps over them.
PARA_LEVEL_ELEMENTS = (
    "moduleinfo", "title", "verbatim", "enumerate", "item",
    "interpreter-session", "back-matter", "interactive-session",
    "opcodedesc", "classdesc", "datadesc",
    "cfuncdesc", "ctypedesc", "cvardesc",
    # "memberdescni" was previously misspelled "membderdescni", so that
    # element was never recognized here.
    "funcdesc", "methoddesc", "excdesc", "memberdesc", "memberdescni",
    "funcdescni", "methoddescni", "excdescni",
    "tableii", "tableiii", "tableiv", "localmoduletable",
    "sectionauthor", "seealso", "itemize",
    # include <para>, so we can just do it again to get subsequent paras:
    PARA_ELEMENT,
    )
632
# Elements that skip_leading_nodes() steps over when looking for the start
# of paragraph material.
PARA_LEVEL_PRECEEDERS = (
    "setindexsubitem",
    "author",
    "stindex",
    "obindex",
    "COMMENT",
    "label",
    "xi:include",
    "title",
    "versionadded",
    "versionchanged",
    "declaremodule",
    "modulesynopsis",
    "moduleauthor",
    "indexterm",
    "leader",
    )
639
640
def fixup_paras(doc, fragment):
    """Build paragraph elements throughout *fragment*.

    The paragraph fixer is run on each top-level child that is a paragraph
    container, and afterwards on every <description> element found
    anywhere in the fragment.
    """
    for toplevel in fragment.childNodes:
        if toplevel.nodeName in RECURSE_INTO_PARA_CONTAINERS:
            fixup_paras_helper(doc, toplevel)
    for description in find_all_elements(fragment, "description"):
        fixup_paras_helper(doc, description)
648
649
def fixup_paras_helper(doc, container, depth=0):
    """Wrap the paragraph material among *container*'s children.

    The document is expected to be normalized already.  Starting from the
    first position reported by skip_leading_nodes(), nested paragraph
    containers are processed recursively and anything else is handed to
    build_para().  With DEBUG_PARA_FIXER set and *depth* equal to 10 the
    process exits with status 1 after building a paragraph.
    """
    children = container.childNodes
    start = skip_leading_nodes(children)
    while start < len(children):
        candidate = children[start]
        if candidate.nodeName in RECURSE_INTO_PARA_CONTAINERS:
            # Something to recurse into:
            fixup_paras_helper(doc, candidate)
        else:
            # Paragraph material:
            build_para(doc, container, start, len(children))
            if DEBUG_PARA_FIXER and depth == 10:
                sys.exit(1)
        start = skip_leading_nodes(children, start + 1)
664
665
def build_para(doc, parent, start, i):
    """Wrap a run of *parent*'s children, beginning at *start*, in a paragraph.

    Children from index *start* up to (not including) *i* are collected
    until a text node containing a blank line or an element named in
    PARA_LEVEL_ELEMENTS / RECURSE_INTO_PARA_CONTAINERS is met.  Text nodes
    are split so that the blank line, and any trailing whitespace, stay
    outside the new paragraph element.  Returns an index into *parent*'s
    children just past the inserted paragraph and its following newline
    text.

    Raises ConversionError if no content could be collected.
    """
    children = parent.childNodes
    break_elements = PARA_LEVEL_ELEMENTS + RECURSE_INTO_PARA_CONTAINERS
    after = start + 1
    have_last = 0
    # Collect all children until \n\n+ is found in a text node or a
    # member of break_elements is found.
    for j in range(start, i):
        child = children[j]
        after = j + 1
        kind = child.nodeType
        if kind == ELEMENT:
            if child.tagName in break_elements:
                after = j
                break
        elif kind == TEXT:
            pos = child.data.find("\n\n")
            if pos == 0:
                after = j
                break
            if pos >= 1:
                child.splitText(pos)
                break
    else:
        # Ran off the end without hitting a break: the paragraph extends
        # to the last child considered.
        have_last = 1
    if after <= start:
        raise ConversionError(
            "build_para() could not identify content to turn into a paragraph")
    tail = children[after - 1]
    if tail.nodeType == TEXT:
        # we may need to split off trailing white space:
        data = tail.data
        stripped = data.rstrip()
        if stripped != data:
            have_last = 0
            tail.splitText(len(stripped))
    para = doc.createElement(PARA_ELEMENT)
    # Move the nodes last-to-first, each one in front of the previous, so
    # the indexes of the nodes still to be moved stay valid.
    prev = None
    for j in reversed(range(start, after)):
        node = parent.childNodes[j]
        parent.removeChild(node)
        para.insertBefore(node, prev)
        prev = node
    if not have_last:
        nextnode = parent.childNodes[start]
        if nextnode.nodeType == TEXT:
            if nextnode.data and nextnode.data[0] != "\n":
                nextnode.data = "\n" + nextnode.data
        else:
            newnode = doc.createTextNode("\n")
            parent.insertBefore(newnode, nextnode)
            nextnode = newnode
            start = start + 1
        parent.insertBefore(para, nextnode)
        return start + 1
    parent.appendChild(para)
    parent.appendChild(doc.createTextNode("\n\n"))
    return len(parent.childNodes)
726
727
def skip_leading_nodes(children, start=0):
    """Return the index in *children* where paragraph building should begin.

    Scanning starts at *start* and passes over whitespace-only text nodes
    and elements listed in PARA_LEVEL_ELEMENTS or PARA_LEVEL_PRECEEDERS.
    It stops at a paragraph container (for which the caller recurses), at
    any other element, or at a text node with content; leading whitespace
    of such a text node is split off into its own node first.

    When the return value >= len(children), we've built all the paras we
    can from this list of children.
    """
    stop = len(children)
    skippable = PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS
    while start < stop:
        # skip over leading comments and whitespace:
        child = children[start]
        kind = child.nodeType
        if kind == TEXT:
            data = child.data
            stripped = data.lstrip()
            if stripped:
                if stripped == data:
                    return start
                # break into two nodes: whitespace and non-whitespace
                child.splitText(len(data) - len(stripped))
                return start + 1
            # all whitespace, just skip
        elif kind == ELEMENT:
            gi = child.tagName
            if gi in RECURSE_INTO_PARA_CONTAINERS or gi not in skippable:
                return start
        start = start + 1
    return start
759
760
def fixup_rfc_references(doc, fragment):
    """Give every <pep> and <rfc> element text such as "RFC 822".

    The text is the upper-cased tag name, a space, and the value of the
    element's "num" attribute, appended as a new text node.
    """
    for ref in find_all_elements_from_set(fragment, ("pep", "rfc")):
        label = "%s %s" % (ref.tagName.upper(), ref.getAttribute("num"))
        ref.appendChild(doc.createTextNode(label))
765
766
def fixup_signatures(doc, fragment):
    """Apply rewrite_args() to the argument lists below *fragment*.

    For each top-level element, all <args> descendants are rewritten
    first, then all <constructor-args> descendants.
    """
    for child in fragment.childNodes:
        if child.nodeType != ELEMENT:
            continue
        for gi in ("args", "constructor-args"):
            for arglist in child.getElementsByTagName(gi):
                rewrite_args(doc, arglist)
776
def rewrite_args(doc, arglist):
    """Flatten <optional> markup in *arglist* and tidy its whitespace.

    After fixup_args() and normalization, if *arglist* consists of exactly
    one text node, runs of whitespace in it are collapsed to single spaces
    and leading/trailing whitespace is removed.
    """
    fixup_args(doc, arglist)
    arglist.normalize()
    only = arglist.firstChild
    if len(arglist.childNodes) == 1 and only.nodeType == TEXT:
        only.data = ' '.join(only.data.split())
783
def fixup_args(doc, arglist):
    """Replace each <optional> child of *arglist* by its bracketed contents.

    The first <optional> child is replaced by a "[" text node, its own
    children, and a "]" text node; the scan then restarts so that
    <optional> elements exposed by the replacement are handled as well.
    """
    while True:
        optional = None
        for child in arglist.childNodes:
            if child.nodeName == "optional":
                optional = child
                break
        if optional is None:
            return
        # found it; fix and look again
        arglist.insertBefore(doc.createTextNode("["), optional)
        # insertBefore() re-parents the node, so the <optional> empties as
        # its children are moved out.
        while optional.childNodes:
            arglist.insertBefore(optional.firstChild, optional)
        arglist.insertBefore(doc.createTextNode("]"), optional)
        arglist.removeChild(optional)
795
796
def fixup_sectionauthors(doc, fragment):
    """Turn every <sectionauthor name="..."/> into an <author> element.

    The element is renamed, its "name" attribute becomes its text content,
    and it is re-inserted in its parent: before the third child when the
    second child is a <title>, otherwise before the first child.  Parents
    too short for those positions get the author appended (title case) or
    inserted at the front instead of raising IndexError.
    """
    for sectauth in find_all_elements(fragment, "sectionauthor"):
        section = sectauth.parentNode
        section.removeChild(sectauth)
        set_tagName(sectauth, "author")
        sectauth.appendChild(doc.createTextNode(
            sectauth.getAttribute("name")))
        sectauth.removeAttribute("name")
        siblings = section.childNodes
        if len(siblings) > 1 and siblings[1].nodeName == "title":
            # Go right after the title; None makes insertBefore() append.
            if len(siblings) > 2:
                after = siblings[2]
            else:
                after = None
        elif siblings:
            after = siblings[0]
        else:
            after = None
        section.insertBefore(doc.createTextNode("\n "), after)
        section.insertBefore(sectauth, after)
811
812
def fixup_verbatims(doc):
    """Rename <verbatim> elements that hold interpreter sessions.

    A <verbatim> whose first child is a text node starting (after leading
    whitespace) with ">>>" becomes an <interactive-session>.  Empty
    <verbatim> elements are left unchanged.
    """
    for verbatim in find_all_elements(doc, "verbatim"):
        first = verbatim.firstChild
        if first is not None and first.nodeType == TEXT \
           and first.data.lstrip().startswith(">>>"):
            set_tagName(verbatim, "interactive-session")
819
820
def add_node_ids(fragment, counter=0):
    """Store a distinct integer in the node_id attribute of every node.

    *fragment* receives *counter*; its descendants are numbered in
    document order with increasing values.  Returns the next value to use
    after this subtree.
    """
    fragment.node_id = counter
    next_id = counter
    for node in fragment.childNodes:
        next_id = next_id + 1
        if node.nodeType == ELEMENT:
            next_id = add_node_ids(node, next_id)
        else:
            node.node_id = next_id
    return next_id + 1
830
831
def fixup_ulink(doc, fragment):
    """Convert each two-child <ulink> into an element with an href attribute.

    The first child holds the link text and the second the URL.  The URL
    becomes the "href" attribute, the text container's children are moved
    directly under the <ulink>, and both original children are removed.
    The expected shape is checked with assert statements.
    """
    for ulink in find_all_elements(fragment, "ulink"):
        parts = ulink.childNodes
        assert len(parts) == 2
        text, href = parts[0], parts[1]
        href.normalize()
        assert len(href.childNodes) == 1
        url_node = href.childNodes[0]
        assert url_node.nodeType == TEXT
        ulink.setAttribute("href", url_node.data)
        ulink.removeChild(href)
        # appendChild() re-parents the node, draining the text container.
        content = text.childNodes
        while content:
            ulink.appendChild(content[0])
        ulink.removeChild(text)
848
849
# Names of the <ref*modindex> elements handled by fixup_refmodindexes().
REFMODINDEX_ELEMENTS = (
    'refmodindex',
    'refbimodindex',
    'refexmodindex',
    'refstmodindex',
    )
852
def fixup_refmodindexes(fragment):
    """Fold <ref*modindex> elements into neighbouring <module> elements.

    Locate <ref*modindex>...</> co-located with <module>...</>, and
    remove the <ref*modindex>, replacing it with index=index on the
    <module> element.  Every parent of such an element is passed to
    fixup_refmodindexes_chunk() exactly once; parents are told apart by
    their node_id attribute.
    """
    parents = {}
    for node in find_all_elements_from_set(fragment, REFMODINDEX_ELEMENTS):
        parent = node.parentNode
        parents[parent.node_id] = parent
    # An explicit loop, since the calls are made only for their effects.
    for parent in parents.values():
        fixup_refmodindexes_chunk(parent)
864
865
def fixup_refmodindexes_chunk(container):
    """Merge the <ref*modindex> entries of *container* into its <module>s.

    Each childless index entry whose "module" attribute equals the text of
    at least one single-child <module> element in *container* causes those
    <module> elements to get index="yes"; the entry itself is then removed.
    Entries with children are reported on standard error and skipped.
    """
    # node is probably a <para>; let's see how often it isn't:
    if container.tagName != PARA_ELEMENT:
        bwrite("--- fixup_refmodindexes_chunk(%s)\n" % container)
    module_entries = find_all_elements(container, "module")
    if not module_entries:
        return
    index_entries = find_all_elements_from_set(container, REFMODINDEX_ELEMENTS)
    removes = []
    for entry in index_entries:
        if len(entry.childNodes) != 0:
            bwrite("--- unexpected number of children for %s node:\n"
                   % entry.tagName)
            ewrite(entry.toxml() + "\n")
            continue
        found = 0
        module_name = entry.getAttribute("module")
        for node in module_entries:
            if len(node.childNodes) != 1:
                continue
            this_name = node.childNodes[0].data
            if this_name == module_name:
                found = 1
                node.setAttribute("index", "yes")
        if found:
            removes.append(entry)
    for node in removes:
        # The entries were gathered recursively, so one may be nested
        # deeper than a direct child; detach it from its actual parent.
        node.parentNode.removeChild(node)
895
896
def fixup_bifuncindexes(fragment):
    """Run fixup_bifuncindexes_chunk() on each parent of a <bifuncindex>.

    Parents are keyed by their node_id attribute to make sure that each
    parent is only processed once.
    """
    parents = {}
    for node in find_all_elements(fragment, 'bifuncindex'):
        parent = node.parentNode
        parents[parent.node_id] = parent
    # An explicit loop, since the calls are made only for their effects.
    for parent in parents.values():
        fixup_bifuncindexes_chunk(parent)
906
907
def fixup_bifuncindexes_chunk(container):
    """Merge <bifuncindex> children of *container* into sibling <function>s.

    A <function> child matches an index entry when its first child's text
    is the entry's "name" attribute followed by "()".  Matching functions
    get index="yes" and module="__builtin__", and an entry with at least
    one match is removed from *container*.
    """
    index_entries = find_all_child_elements(container, "bifuncindex")
    functions = find_all_child_elements(container, "function")
    removes = []
    for entry in index_entries:
        wanted = entry.getAttribute("name") + "()"
        matched = False
        for func in functions:
            if func.childNodes[0].data != wanted:
                continue
            func.setAttribute("index", "yes")
            func.setAttribute("module", "__builtin__")
            matched = True
        if matched:
            removes.append(entry)
    for entry in removes:
        container.removeChild(entry)
928
929
def join_adjacent_elements(container, gi):
    """Merge runs of directly adjacent *gi* elements below *container*.

    Whenever a node named *gi* is immediately followed by a sibling with
    the same name, the sibling's children are appended to the first node
    and the sibling is removed; a note is written to standard error for
    each merge.  All element children, including the last one of each
    parent, are visited in turn.
    """
    queue = [container]
    while queue:
        parent = queue.pop()
        children = parent.childNodes
        i = 0
        # len(children) is re-read every time round because merging
        # shortens the live child list.
        while i < len(children):
            child = children[i]
            if child.nodeName == gi and i + 1 < len(children) \
               and children[i + 1].nodeName == gi:
                ewrite("--- merging two <%s/> elements\n" % gi)
                nextchild = children[i + 1]
                nextchildren = nextchild.childNodes
                while len(nextchildren):
                    node = nextchildren[0]
                    nextchild.removeChild(node)
                    child.appendChild(node)
                parent.removeChild(nextchild)
                # Stay on the same node: the one now following it may
                # need merging too.
                continue
            if child.nodeType == ELEMENT:
                queue.append(child)
            i = i + 1
954
955
956_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
957
def write_esis(doc, ofp, knownempty):
    """Write the children of *doc* to *ofp* as ESIS events, recursively.

    *knownempty* is a callable taking a tag name and returning true for
    elements declared empty; such elements are announced with an "e"
    line.  Attribute values matching _token_rx are typed TOKEN, all others
    CDATA.

    Raises ValueError if a declared-empty element has children, and
    RuntimeError for node types other than element, text and entity
    reference.
    """
    for node in doc.childNodes:
        nodeType = node.nodeType
        if nodeType == ELEMENT:
            gi = node.tagName
            if knownempty(gi):
                if node.hasChildNodes():
                    raise ValueError(
                        "declared-empty node <%s> has children" % gi)
                ofp.write("e\n")
            for k, value in node.attributes.items():
                if _token_rx.match(value):
                    dtype = "TOKEN"
                else:
                    dtype = "CDATA"
                ofp.write("A%s %s %s\n" % (k, dtype, esistools.encode(value)))
            ofp.write("(%s\n" % gi)
            write_esis(node, ofp, knownempty)
            ofp.write(")%s\n" % gi)
        elif nodeType == TEXT:
            ofp.write("-%s\n" % esistools.encode(node.data))
        elif nodeType == ENTITY_REFERENCE:
            ofp.write("&%s\n" % node.nodeName)
        else:
            raise RuntimeError("unsupported node type: %s" % nodeType)
983
984
def convert(ifp, ofp):
    """Read ESIS data from *ifp*, fix up the tree, and write ESIS to *ofp*.

    The input is parsed with esistools into a document fragment, the
    fix-up passes of this module are applied in a fixed order, and the
    result is written with write_esis().  An IOError whose errno is EPIPE
    during output is ignored; any other error propagates.
    """
    events = esistools.parse(ifp)
    toktype, doc = events.getEvent()
    fragment = doc.createDocumentFragment()
    events.expandNode(fragment)

    normalize(fragment)
    simplify(doc, fragment)
    handle_labels(doc, fragment)
    handle_appendix(doc, fragment)
    fixup_trailing_whitespace(doc, fragment, {
        # element -> (before-end-tag, after-end-tag)
        "abstract": ("\n", "\n"),
        "title": ("", "\n"),
        "chapter": ("\n", "\n\n\n"),
        "section": ("\n", "\n\n\n"),
        "subsection": ("\n", "\n\n"),
        "subsubsection": ("\n", "\n\n"),
        "paragraph": ("\n", "\n\n"),
        "subparagraph": ("\n", "\n\n"),
        "description": ("\n", "\n\n"),
        "enumeration": ("\n", "\n\n"),
        "item": ("\n", "\n\n"),
        })
    cleanup_root_text(doc)
    cleanup_trailing_parens(fragment, ["function", "method", "cfunction"])
    cleanup_synopses(doc, fragment)
    fixup_descriptors(doc, fragment)
    fixup_verbatims(fragment)
    normalize(fragment)
    fixup_paras(doc, fragment)
    fixup_sectionauthors(doc, fragment)
    fixup_table_structures(doc, fragment)
    fixup_rfc_references(doc, fragment)
    fixup_signatures(doc, fragment)
    fixup_ulink(doc, fragment)
    add_node_ids(fragment)
    fixup_refmodindexes(fragment)
    fixup_bifuncindexes(fragment)
    # Take care of ugly hacks in the LaTeX markup to avoid LaTeX and
    # LaTeX2HTML screwing with GNU-style long options (the '--' problem).
    join_adjacent_elements(fragment, "option")
    # Attempt to avoid trailing blank lines; only a trailing text node can
    # carry them, so anything else is left alone.
    fragment.normalize()
    last = fragment.lastChild
    if last is not None and last.nodeType == TEXT \
       and last.data[-1:] == "\n":
        last.data = last.data.rstrip() + "\n"
    #
    # These three are written with content by this module even if the
    # parser declared them empty.
    empties = set(events.parser.get_empties())
    empties.difference_update(("author", "pep", "rfc"))
    knownempty = empties.__contains__
    #
    try:
        write_esis(fragment, ofp, knownempty)
    except IOError:
        # Ignore EPIPE; it just means that whoever we're writing to stopped
        # reading.  The rest of the output would be ignored.  All other
        # errors should still be reported.  The errno attribute is read
        # instead of unpacking the exception, which fails for IOErrors
        # that do not carry exactly two arguments.
        err = sys.exc_info()[1]
        if getattr(err, "errno", None) != errno.EPIPE:
            raise
1048
1049
def usage():
    """Write a one-line command synopsis to standard error."""
    ewrite("usage: %s [infile [outfile]]\n" % sys.argv[0])


def main():
    """Command-line entry point: ``docfixer.py [infile [outfile]]``.

    Without arguments, input is read from standard input and output goes
    to standard output.  With an output file name, the result is buffered
    in memory and the file is only written once conversion has succeeded.
    More than two arguments print a usage message and exit with status 2.
    """
    argc = len(sys.argv)
    if argc > 3:
        usage()
        sys.exit(2)
    ifp = sys.stdin
    ofp = sys.stdout
    if argc >= 2:
        ifp = open(sys.argv[1])
    if argc == 3:
        try:
            from StringIO import StringIO
        except ImportError:
            from io import StringIO
        ofp = StringIO()
    try:
        convert(ifp, ofp)
    finally:
        # Close the input only if this function opened it.
        if ifp is not sys.stdin:
            ifp.close()
    if argc == 3:
        fp = open(sys.argv[2], "w")
        try:
            fp.write(ofp.getvalue())
        finally:
            fp.close()
        ofp.close()


if __name__ == "__main__":
    main()
Note: See TracBrowser for help on using the repository browser.