Context Navigation

docfixer.py

Visit:

Last change on this file was 3225, checked in by bird, 19 years ago
Python 2.5
File size: 36.4 KB

Line
1	#! /usr/bin/env python
2
3	"""Perform massive transformations on a document tree created from the LaTeX
4	of the Python documentation, and dump the ESIS data for the transformed tree.
5	"""
6
7
8	import errno
9	import esistools
10	import re
11	import sys
12	import xml.dom
13	import xml.dom.minidom
14
# Short aliases for the DOM node-type constants that the functions in this
# module compare against node.nodeType.
ELEMENT = xml.dom.Node.ELEMENT_NODE
ENTITY_REFERENCE = xml.dom.Node.ENTITY_REFERENCE_NODE
TEXT = xml.dom.Node.TEXT_NODE
18
19
class ConversionError(Exception):
    """Raised when the document tree cannot be converted as expected."""
22
23
# ewrite() writes plain text to stderr; bwrite() writes it in bold when the
# terminal supports that, and is otherwise the same function as ewrite().
ewrite = sys.stderr.write
try:
    # We can only do this trick on Unix (if tput is on $PATH)!
    # os.name is "posix" on Unix-like systems; sys.platform is a string such
    # as "linux2" or "darwin" and never equals "posix", so testing it
    # disabled the bold output everywhere.
    import os
    if os.name != "posix" or not sys.stderr.isatty():
        raise ImportError
    import commands
except ImportError:
    bwrite = ewrite
else:
    def bwrite(s, BOLDON=commands.getoutput("tput bold"),
               BOLDOFF=commands.getoutput("tput sgr0")):
        """Write s to stderr wrapped in the terminal's bold on/off sequences.

        The escape sequences are fetched from tput once, when this function
        is defined, and cached as default argument values.
        """
        ewrite("%s%s%s" % (BOLDON, s, BOLDOFF))
36
37
# Tag name given to the paragraph elements created by build_para().
PARA_ELEMENT = "para"

# Set to a true value to get trace output from para_msg().
DEBUG_PARA_FIXER = 0


def para_msg(s):
    """Report a paragraph-fixer debug message (no-op unless debugging)."""
    pass

if DEBUG_PARA_FIXER:
    def para_msg(s):
        """Report a paragraph-fixer debug message on stderr."""
        ewrite("*** %s\n" % s)
48
49
def get_first_element(doc, gi):
    """Return the first direct child of doc whose nodeName is gi, or None."""
    matches = [child for child in doc.childNodes if child.nodeName == gi]
    if matches:
        return matches[0]
    return None
54
def extract_first_element(doc, gi):
    """Detach and return the first direct child of doc named gi.

    Returns None, leaving doc untouched, when no child has that name.
    """
    for candidate in doc.childNodes:
        if candidate.nodeName == gi:
            doc.removeChild(candidate)
            return candidate
    return None
60
61
def get_documentElement(node):
    """Return the last element-type direct child of node, or None.

    Note that when node has several element children, it is the final one
    that is returned.
    """
    elements = [child for child in node.childNodes
                if child.nodeType == ELEMENT]
    if not elements:
        return None
    return elements[-1]
68
69
def set_tagName(elem, gi):
    """Rename elem in place by setting both its nodeName and tagName to gi."""
    elem.nodeName = gi
    elem.tagName = gi
72
73
def find_all_elements(doc, gi):
    """Return a list of doc (if it is named gi) plus all descendants named gi.

    Descendants are gathered per direct element child: the child itself when
    it matches, followed by whatever its getElementsByTagName(gi) returns.
    """
    found = []
    if doc.nodeName == gi:
        found.append(doc)
    for child in doc.childNodes:
        if child.nodeType != ELEMENT:
            continue
        if child.tagName == gi:
            found.append(child)
        found.extend(child.getElementsByTagName(gi))
    return found
85
def find_all_child_elements(doc, gi):
    """Return a list of the direct children of doc whose nodeName is gi."""
    return [child for child in doc.childNodes if child.nodeName == gi]
92
93
def find_all_elements_from_set(doc, gi_set):
    """Return doc and/or its descendant elements whose names are in gi_set.

    The result is a new list in document (pre-order) order.
    """
    collected = []
    __find_all_elements_from_set(doc, gi_set, collected)
    return collected
96
def __find_all_elements_from_set(doc, gi_set, nodes):
    """Recursive worker for find_all_elements_from_set().

    Appends doc to nodes when its nodeName is in gi_set, then visits each
    element child in turn.  Returns the same nodes list it was given.
    """
    if doc.nodeName in gi_set:
        nodes.append(doc)
    element_children = [kid for kid in doc.childNodes
                        if kid.nodeType == ELEMENT]
    for kid in element_children:
        __find_all_elements_from_set(kid, gi_set, nodes)
    return nodes
104
105
def simplify(doc, fragment):
    """Restructure the top level of fragment around a single root element.

    doc is used only as a factory for new text nodes; fragment is modified
    in place.
    """
    # Try to rationalize the document a bit, since these things are simply
    # not valid SGML/XML documents as they stand, and need a little work.
    documentclass = "document"
    inputs = []
    # A top-level <documentclass> is removed; its "classname" attribute
    # becomes the new name of the root element.
    node = extract_first_element(fragment, "documentclass")
    if node is not None:
        documentclass = node.getAttribute("classname")
    # A top-level <title> is removed here and re-inserted into the document
    # element below, together with any <input> elements.
    node = extract_first_element(fragment, "title")
    if node is not None:
        inputs.append(node)
    # update the name of the root element
    node = get_first_element(fragment, "document")
    if node is not None:
        set_tagName(node, documentclass)
        # Move everything that comes before this node into this node;
        # this will be the document element.
        nodelist = fragment.childNodes
        point = node.firstChild
        # insertBefore() re-parents nodelist[0], so the live list shrinks
        # until the renamed node itself is first.
        while not nodelist[0].isSameNode(node):
            node.insertBefore(nodelist[0], point)
    # Pull every remaining top-level <input> out of the fragment.
    while 1:
        node = extract_first_element(fragment, "input")
        if node is None:
            break
        inputs.append(node)
    if inputs:
        docelem = get_documentElement(fragment)
        # Each node is inserted at the front, so walk the list backwards to
        # end up with the nodes in their collected order.
        inputs.reverse()
        for node in inputs:
            text = doc.createTextNode("\n")
            docelem.insertBefore(text, docelem.firstChild)
            docelem.insertBefore(node, text)
        docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild)
    # Drop any text nodes left at the very start of the fragment.
    while fragment.firstChild and fragment.firstChild.nodeType == TEXT:
        fragment.removeChild(fragment.firstChild)
142
143
def cleanup_root_text(doc):
    """Remove direct text children of doc, keeping text that follows a COMMENT.

    A text node is kept only when the node immediately before it is a
    non-discarded node whose nodeName is "COMMENT".
    """
    doomed = []
    after_comment = 0
    for child in doc.childNodes:
        was_after_comment = after_comment
        after_comment = 0
        if child.nodeType == TEXT and not was_after_comment:
            doomed.append(child)
        elif child.nodeName == "COMMENT":
            after_comment = 1
    # Remove only after the scan so the child list is not mutated while it
    # is being iterated.
    for child in doomed:
        doc.removeChild(child)
156
157
# Tag names that find_and_fix_descriptors() hands to rewrite_descriptor().
DESCRIPTOR_ELEMENTS = (
    "cfuncdesc", "cvardesc", "ctypedesc",
    "classdesc", "memberdesc", "memberdescni", "methoddesc", "methoddescni",
    "excdesc", "funcdesc", "funcdescni", "opcodedesc",
    "datadesc", "datadescni",
    )
164
def fixup_descriptors(doc, fragment):
    """Rewrite the descriptor elements found in every <section> of fragment."""
    for section in find_all_elements(fragment, "section"):
        find_and_fix_descriptors(doc, section)
169
170
def find_and_fix_descriptors(doc, container):
    """Rewrite descriptor children of container, descending into subsections.

    Only direct children are examined; <subsection> children are processed
    recursively, and other elements are left alone.
    """
    for child in container.childNodes:
        if child.nodeType != ELEMENT:
            continue
        gi = child.tagName
        if gi == "subsection":
            find_and_fix_descriptors(doc, child)
        elif gi in DESCRIPTOR_ELEMENTS:
            rewrite_descriptor(doc, child)
180
181
def rewrite_descriptor(doc, descriptor):
    """Restructure descriptor into <signature> and <description> children.

    doc is used as the factory for new nodes; descriptor is modified in
    place.  Raises RuntimeError when a "var" attribute is found on anything
    other than an <opcodedesc>.
    """
    #
    # Do these things:
    #   1. Add an "index='no'" attribute to the element if the tagName
    #      ends in 'ni', removing the 'ni' from the name.
    #   2. Create a <signature> from the name attribute
    #   2a.Create an <args> if it appears to be available.
    #   3. Create additional <signature>s from <*line{,ni}> elements,
    #      if found.
    #   4. If a <versionadded> is found, move it to an attribute on the
    #      descriptor.
    #   5. Move remaining child nodes to a <description> element.
    #   6. Put it back together.
    #
    # 1.
    descname = descriptor.tagName
    # NOTE(review): this tests the "name" attribute, while step 1 above talks
    # about a 'ni' tag-name suffix; nothing below sets index='no' or renames
    # the element.  Confirm which test was intended before relying on it.
    index = descriptor.getAttribute("name") != "no"
    desctype = descname[:-4] # remove 'desc'
    linename = desctype + "line"
    if not index:
        linename = linename + "ni"
    # 2.
    signature = doc.createElement("signature")
    name = doc.createElement("name")
    signature.appendChild(doc.createTextNode("\n "))
    signature.appendChild(name)
    name.appendChild(doc.createTextNode(descriptor.getAttribute("name")))
    descriptor.removeAttribute("name")
    # 2a.
    if descriptor.hasAttribute("var"):
        if descname != "opcodedesc":
            raise RuntimeError, \
                  "got 'var' attribute on descriptor other than opcodedesc"
        variable = descriptor.getAttribute("var")
        if variable:
            args = doc.createElement("args")
            args.appendChild(doc.createTextNode(variable))
            signature.appendChild(doc.createTextNode("\n "))
            signature.appendChild(args)
        descriptor.removeAttribute("var")
    # newchildren accumulates the replacement content for step 6.
    newchildren = [signature]
    children = descriptor.childNodes
    pos = skip_leading_nodes(children)
    if pos < len(children):
        child = children[pos]
        if child.nodeName == "args":
            # move <args> to <signature>, or remove if empty:
            child.parentNode.removeChild(child)
            if len(child.childNodes):
                signature.appendChild(doc.createTextNode("\n "))
                signature.appendChild(child)
    signature.appendChild(doc.createTextNode("\n "))
    # 3, 4.
    pos = skip_leading_nodes(children, pos)
    while pos < len(children) \
          and children[pos].nodeName in (linename, "versionadded"):
        if children[pos].tagName == linename:
            # this is really a supplemental signature, create <signature>
            # Keep a copy so the untouched element can be printed if the
            # conversion fails.
            oldchild = children[pos].cloneNode(1)
            try:
                sig = methodline_to_signature(doc, children[pos])
            except KeyError:
                print oldchild.toxml()
                raise
            newchildren.append(sig)
        else:
            # <versionadded added=...>
            descriptor.setAttribute(
                "added", children[pos].getAttribute("version"))
        pos = skip_leading_nodes(children, pos + 1)
    # 5.
    description = doc.createElement("description")
    description.appendChild(doc.createTextNode("\n"))
    newchildren.append(description)
    move_children(descriptor, description, pos)
    last = description.childNodes[-1]
    if last.nodeType == TEXT:
        last.data = last.data.rstrip() + "\n "
    # 6.
    # should have nothing but whitespace and signature lines in <descriptor>;
    # discard them
    while descriptor.childNodes:
        descriptor.removeChild(descriptor.childNodes[0])
    for node in newchildren:
        descriptor.appendChild(doc.createTextNode("\n "))
        descriptor.appendChild(node)
    descriptor.appendChild(doc.createTextNode("\n"))
269
270
def methodline_to_signature(doc, methodline):
    """Build and return a <signature> element from a <*line> element.

    The "name" attribute of methodline becomes a <name> child (and is
    removed from methodline); any children of methodline are moved into an
    <args> child.
    """
    name = doc.createElement("name")
    name.appendChild(doc.createTextNode(methodline.getAttribute("name")))
    methodline.removeAttribute("name")

    signature = doc.createElement("signature")
    signature.appendChild(doc.createTextNode("\n "))
    signature.appendChild(name)
    if len(methodline.childNodes) != 0:
        args = doc.createElement("args")
        signature.appendChild(doc.createTextNode("\n "))
        signature.appendChild(args)
        move_children(methodline, args)
    signature.appendChild(doc.createTextNode("\n "))
    return signature
285
286
def move_children(origin, dest, start=0):
    """Move the children of origin from index start onwards to the end of dest.

    Children before index start stay in origin; the moved nodes keep their
    relative order.
    """
    remaining = origin.childNodes
    # remaining is a live list: each removal shifts the next child down to
    # index start.
    while len(remaining) > start:
        moving = remaining[start]
        origin.removeChild(moving)
        dest.appendChild(moving)
293
294
def handle_appendix(doc, fragment):
    """Move everything after the first <appendix> marker into <back-matter>.

    The children of the document element are scanned in order.  The first
    child that contains an <appendix> descendant has that marker removed;
    every later sibling is then moved into a new <back-matter> element
    appended to the document element.  Nothing is added when no siblings
    follow the marker's container.
    """
    # must be called after simplify() if document is multi-rooted to begin with
    docelem = get_documentElement(fragment)
    appendices = 0
    nodes = []
    for node in docelem.childNodes:
        if appendices:
            nodes.append(node)
        elif node.nodeType == ELEMENT:
            appnodes = node.getElementsByTagName("appendix")
            if appnodes:
                appendices = 1
                parent = appnodes[0].parentNode
                parent.removeChild(appnodes[0])
                parent.normalize()
    if nodes:
        # Explicit loops rather than map(): the calls are made purely for
        # their side effects.
        for node in nodes:
            docelem.removeChild(node)
        docelem.appendChild(doc.createTextNode("\n\n\n"))
        back = doc.createElement("back-matter")
        docelem.appendChild(back)
        back.appendChild(doc.createTextNode("\n"))
        # Leading whitespace-only text is dropped rather than moved.
        while nodes and nodes[0].nodeType == TEXT \
              and not nodes[0].data.strip():
            del nodes[0]
        for node in nodes:
            back.appendChild(node)
        docelem.appendChild(doc.createTextNode("\n"))
322
323
def handle_labels(doc, fragment):
    """Turn <label id="..."/> elements into id attributes on their context.

    A label inside a <title> sets the id on the title's parent; any other
    label sets it on its own parent.  The label is then removed.  Labels
    without an id are left untouched.
    """
    for label in find_all_elements(fragment, "label"):
        label_id = label.getAttribute("id")
        if not label_id:
            continue
        parent = label.parentNode
        parentTagName = parent.tagName
        if parentTagName == "title":
            parent.parentNode.setAttribute("id", label_id)
        else:
            parent.setAttribute("id", label_id)
        # now, remove <label id="..."/> from parent:
        parent.removeChild(label)
        if parentTagName == "title":
            parent.normalize()
            # The title may be empty once the label is gone, so look at
            # lastChild instead of indexing the child list.
            last = parent.lastChild
            if last is not None and last.nodeType == TEXT:
                last.data = last.data.rstrip()
342
343
def fixup_trailing_whitespace(doc, fragment, wsmap):
    """Normalize whitespace before and after the end tags of listed elements.

    wsmap maps an element name to a (before-end-tag, after-end-tag) pair of
    strings.  For each element in fragment named in wsmap, trailing
    whitespace in its final text child is replaced by the first string, and
    leading whitespace in the text following the element by the second (a
    new text node is inserted when there is none).
    """
    # Breadth-first scan for the elements to fix.
    queue = [fragment]
    fixups = []
    while queue:
        node = queue.pop(0)
        if node.nodeName in wsmap:
            fixups.append(node)
        for child in node.childNodes:
            if child.nodeType == ELEMENT:
                queue.append(child)

    # reverse the list to process from the inside out
    fixups.reverse()
    for node in fixups:
        node.parentNode.normalize()
        lastchild = node.lastChild
        before, after = wsmap[node.tagName]
        # lastchild is None for an element without children.
        if lastchild is not None and lastchild.nodeType == TEXT:
            lastchild.data = lastchild.data.rstrip() + before
        nextnode = node.nextSibling
        if nextnode is not None and nextnode.nodeType == TEXT:
            nextnode.data = after + nextnode.data.lstrip()
        else:
            wsnode = doc.createTextNode(after)
            node.parentNode.insertBefore(wsnode, nextnode)
        # hack to get the title in place:
        if node.tagName == "title" \
           and node.parentNode.firstChild.nodeType == ELEMENT:
            node.parentNode.insertBefore(doc.createTextNode("\n "),
                                         node.parentNode.firstChild)
            node.parentNode.normalize()
379
380
def normalize(doc):
    """Call normalize() on every element that is a direct child of doc."""
    for element in [n for n in doc.childNodes if n.nodeType == ELEMENT]:
        element.normalize()
385
386
def cleanup_trailing_parens(doc, element_names):
    """Strip a trailing "()" from the text ending each named element.

    The tree below doc is walked breadth-first.  Elements whose tag name is
    in element_names are fixed up and not descended into; all other
    elements are descended into.
    """
    targets = {}
    for gi in element_names:
        targets[gi] = gi
    pending = [node for node in doc.childNodes if node.nodeType == ELEMENT]
    while pending:
        node = pending.pop(0)
        if node.tagName not in targets:
            pending.extend([child for child in node.childNodes
                            if child.nodeType == ELEMENT])
            continue
        tail = node.lastChild
        if tail and tail.nodeType == TEXT and tail.data.endswith("()"):
            tail.data = tail.data[:-2]
406
407
def contents_match(left, right):
    """Return 1 if left and right have equivalent children, else 0.

    Children are compared pairwise: elements must have the same tag name and
    recursively matching contents (attributes are not compared), and text
    nodes must have the same data.  Any other node type counts as a
    mismatch.
    """
    left_kids = left.childNodes
    right_kids = right.childNodes
    if len(left_kids) != len(right_kids):
        return 0
    # The lengths are equal here, so zip() pairs up every child.
    for mine, theirs in zip(left_kids, right_kids):
        kind = mine.nodeType
        if kind != theirs.nodeType:
            return 0
        if kind == ELEMENT:
            # should check attributes, but that's not a problem here
            if mine.tagName != theirs.tagName \
               or not contents_match(mine, theirs):
                return 0
        elif kind == TEXT:
            if mine.data != theirs.data:
                return 0
        else:
            # not quite right, but good enough
            return 0
    return 1
430
431
def create_module_info(doc, section):
    """Gather a section's module metadata into one <moduleinfo> element.

    Does nothing unless section has a <modulesynopsis> child.  Otherwise the
    synopsis, author, platform, module declaration and "version added"
    information are pulled out of section and regrouped under a new
    <moduleinfo> element inserted into section.
    """
    # Heavy.
    node = extract_first_element(section, "modulesynopsis")
    if node is None:
        return
    set_tagName(node, "synopsis")
    # Drop a trailing period from the synopsis text.
    lastchild = node.childNodes[-1]
    if lastchild.nodeType == TEXT \
       and lastchild.data[-1:] == ".":
        lastchild.data = lastchild.data[:-1]
    modauthor = extract_first_element(section, "moduleauthor")
    if modauthor:
        # <moduleauthor name="..."/> becomes <author>...</author>.
        set_tagName(modauthor, "author")
        modauthor.appendChild(doc.createTextNode(
            modauthor.getAttribute("name")))
        modauthor.removeAttribute("name")
    platform = extract_first_element(section, "platform")
    if section.tagName == "section":
        # Child index at which <moduleinfo> is inserted; reset to 0 below
        # when the <title> is removed from the section.
        modinfo_pos = 2
        modinfo = doc.createElement("moduleinfo")
        moddecl = extract_first_element(section, "declaremodule")
        name = None
        if moddecl:
            modinfo.appendChild(doc.createTextNode("\n "))
            name = moddecl.attributes["name"].value
            namenode = doc.createElement("name")
            namenode.appendChild(doc.createTextNode(name))
            modinfo.appendChild(namenode)
            type = moddecl.attributes.get("type")
            if type:
                type = type.value
                modinfo.appendChild(doc.createTextNode("\n "))
                typenode = doc.createElement("type")
                typenode.appendChild(doc.createTextNode(type))
                modinfo.appendChild(typenode)
        versionadded = extract_first_element(section, "versionadded")
        if versionadded:
            modinfo.setAttribute("added", versionadded.getAttribute("version"))
        title = get_first_element(section, "title")
        if title:
            children = title.childNodes
            if len(children) >= 2 \
               and children[0].nodeName == "module" \
               and children[0].childNodes[0].data == name:
                # this is it; morph the <title> into <short-synopsis>
                first_data = children[1]
                if first_data.data[:4] == " ---":
                    first_data.data = first_data.data[4:].lstrip()
                set_tagName(title, "short-synopsis")
                if children[-1].nodeType == TEXT \
                   and children[-1].data[-1:] == ".":
                    children[-1].data = children[-1].data[:-1]
                section.removeChild(title)
                # Also drop whatever is now the section's first child.
                section.removeChild(section.childNodes[0])
                title.removeChild(children[0])
                modinfo_pos = 0
            else:
                ewrite("module name in title doesn't match"
                       " <declaremodule/>; no <short-synopsis/>\n")
        else:
            ewrite("Unexpected condition: <section/> without <title/>\n")
        modinfo.appendChild(doc.createTextNode("\n "))
        modinfo.appendChild(node)
        if title and not contents_match(title, node):
            # The short synopsis is actually different,
            # and needs to be stored:
            modinfo.appendChild(doc.createTextNode("\n "))
            modinfo.appendChild(title)
        if modauthor:
            modinfo.appendChild(doc.createTextNode("\n "))
            modinfo.appendChild(modauthor)
        if platform:
            modinfo.appendChild(doc.createTextNode("\n "))
            modinfo.appendChild(platform)
        modinfo.appendChild(doc.createTextNode("\n "))
        section.insertBefore(modinfo, section.childNodes[modinfo_pos])
        section.insertBefore(doc.createTextNode("\n "), modinfo)
        #
        # The rest of this removes extra newlines from where we cut out
        # a lot of elements. A lot of code for minimal value, but it
        # keeps the generated *ML from being too funny looking.
        #
        section.normalize()
        children = section.childNodes
        for i in range(len(children)):
            node = children[i]
            if node.nodeName == "moduleinfo":
                nextnode = children[i+1]
                if nextnode.nodeType == TEXT:
                    data = nextnode.data
                    s = data.lstrip()
                    # Only rewrite when more than four leading whitespace
                    # characters were stripped.
                    if len(s) < (len(data) - 4):
                        nextnode.data = "\n\n\n" + s
525
526
def cleanup_synopses(doc, fragment):
    """Run create_module_info() on every <section> found in fragment."""
    sections = find_all_elements(fragment, "section")
    for section in sections:
        create_module_info(doc, section)
530
531
def fixup_table_structures(doc, fragment):
    """Run fixup_table() on every <table> found in fragment."""
    tables = find_all_elements(fragment, "table")
    for table in tables:
        fixup_table(doc, table)
535
536
def fixup_table(doc, table):
    """Rebuild table as <tgroup> holding a <thead> and a <tbody>.

    Direct <entry> children become the single header row and direct <row>
    children become the body.  A <row> directly followed (ignoring other
    nodes) by an <hline> gets rowsep="1".  Raises ConversionError if
    anything other than whitespace text and <hline> elements is left over.
    """
    # create the table head
    head_row = doc.createElement("row")
    move_elements_by_name(doc, table, head_row, "entry")
    thead = doc.createElement("thead")
    thead.appendChild(doc.createTextNode("\n "))
    thead.appendChild(head_row)
    thead.appendChild(doc.createTextNode("\n "))
    # mark the rows that are followed by an <hline>
    children = table.childNodes
    prev_row = None
    for child in children:
        if child.nodeType != ELEMENT:
            continue
        gi = child.tagName
        if gi == "hline" and prev_row is not None:
            prev_row.setAttribute("rowsep", "1")
        elif gi == "row":
            prev_row = child
    # create the table body and save the rows:
    tbody = doc.createElement("tbody")
    tbody.appendChild(doc.createTextNode("\n "))
    move_elements_by_name(doc, table, tbody, "row", sep="\n ")
    # and toss the rest:
    while children:
        child = children[0]
        kind = child.nodeType
        if kind == TEXT:
            if child.data.strip():
                raise ConversionError("unexpected free data in <%s>: %r"
                                      % (table.tagName, child.data))
        elif kind == ELEMENT:
            if child.tagName != "hline":
                raise ConversionError(
                    "unexpected <%s> in table" % child.tagName)
        else:
            raise ConversionError(
                "unexpected %s node in table" % child.__class__.__name__)
        table.removeChild(child)
    # nothing left in the <table>; add the <thead> and <tbody>
    tgroup = doc.createElement("tgroup")
    for part in (thead, tbody):
        tgroup.appendChild(doc.createTextNode("\n "))
        tgroup.appendChild(part)
    tgroup.appendChild(doc.createTextNode("\n "))
    table.appendChild(tgroup)
    # now make the <entry>s look nice:
    for row in table.getElementsByTagName("row"):
        fixup_row(doc, row)
589
590
def fixup_row(doc, row):
    """Insert a newline-plus-indent text node before each child but the first."""
    # Snapshot the children first; inserting nodes changes row.childNodes.
    trailing = list(row.childNodes[1:])
    for entry in trailing:
        row.insertBefore(doc.createTextNode("\n "), entry)
597
598
def move_elements_by_name(doc, source, dest, name, sep=None):
    """Move the direct children of source named name to the end of dest.

    When sep is a non-empty string, a text node containing sep is appended
    to dest after each moved node.
    """
    selected = [child for child in source.childNodes
                if child.nodeName == name]
    for node in selected:
        source.removeChild(node)
        dest.appendChild(node)
        if sep:
            dest.appendChild(doc.createTextNode(sep))
609
610
# Elements whose children are themselves scanned for paragraph material
# instead of being wrapped in a paragraph.
RECURSE_INTO_PARA_CONTAINERS = (
    "chapter", "abstract", "enumerate",
    "section", "subsection", "subsubsection",
    "paragraph", "subparagraph", "back-matter",
    "howto", "manual",
    "item", "itemize", "fulllineitems", "enumeration", "descriptionlist",
    "definitionlist", "definition",
    )

# Elements that end the paragraph being collected and are never placed
# inside one.  ("memberdescni" was previously misspelled "membderdescni",
# so that element was not recognized.)
PARA_LEVEL_ELEMENTS = (
    "moduleinfo", "title", "verbatim", "enumerate", "item",
    "interpreter-session", "back-matter", "interactive-session",
    "opcodedesc", "classdesc", "datadesc",
    "cfuncdesc", "ctypedesc", "cvardesc",
    "funcdesc", "methoddesc", "excdesc", "memberdesc", "memberdescni",
    "funcdescni", "methoddescni", "excdescni",
    "tableii", "tableiii", "tableiv", "localmoduletable",
    "sectionauthor", "seealso", "itemize",
    # include <para>, so we can just do it again to get subsequent paras:
    PARA_ELEMENT,
    )

# Elements that skip_leading_nodes() steps over when looking for the start
# of paragraph material.
PARA_LEVEL_PRECEEDERS = (
    "setindexsubitem", "author",
    "stindex", "obindex", "COMMENT", "label", "xi:include", "title",
    "versionadded", "versionchanged", "declaremodule", "modulesynopsis",
    "moduleauthor", "indexterm", "leader",
    )
639
640
def fixup_paras(doc, fragment):
    """Build paragraphs in the containers of fragment.

    The paragraph-container children of fragment are processed first,
    followed by every <description> element found anywhere in fragment.
    """
    for child in fragment.childNodes:
        if child.nodeName in RECURSE_INTO_PARA_CONTAINERS:
            fixup_paras_helper(doc, child)
    for description in find_all_elements(fragment, "description"):
        fixup_paras_helper(doc, description)
648
649
def fixup_paras_helper(doc, container, depth=0):
    """Wrap the loose content of container in paragraph elements.

    Children that are themselves paragraph containers are processed
    recursively; everything else is handed to build_para().
    """
    # document is already normalized
    children = container.childNodes
    start = skip_leading_nodes(children)
    while start < len(children):
        candidate = children[start]
        if candidate.nodeName in RECURSE_INTO_PARA_CONTAINERS:
            # Something to recurse into:
            fixup_paras_helper(doc, candidate)
        else:
            # Paragraph material:
            build_para(doc, container, start, len(children))
            if DEBUG_PARA_FIXER and depth == 10:
                sys.exit(1)
        start = skip_leading_nodes(children, start + 1)
664
665
def build_para(doc, parent, start, i):
    """Wrap a run of parent's children, beginning at index start, in a para.

    Children are collected from index start up to (not including) index i,
    stopping early at a blank line in a text node or at a break element.
    The collected nodes are moved into a new PARA_ELEMENT element that takes
    their place in parent.  Returns an index into parent's children; raises
    ConversionError when no content could be collected.
    """
    children = parent.childNodes
    after = start + 1
    # Set when the run reaches index i without hitting a break.
    have_last = 0
    BREAK_ELEMENTS = PARA_LEVEL_ELEMENTS + RECURSE_INTO_PARA_CONTAINERS
    # Collect all children until \n\n+ is found in a text node or a
    # member of BREAK_ELEMENTS is found.
    for j in range(start, i):
        after = j + 1
        child = children[j]
        nodeType = child.nodeType
        if nodeType == ELEMENT:
            if child.tagName in BREAK_ELEMENTS:
                after = j
                break
        elif nodeType == TEXT:
            pos = child.data.find("\n\n")
            if pos == 0:
                after = j
                break
            if pos >= 1:
                # Split so that only the text before the blank line ends up
                # in the paragraph.
                child.splitText(pos)
                break
    else:
        # The loop finished without a break.
        have_last = 1
    if (start + 1) > after:
        raise ConversionError(
            "build_para() could not identify content to turn into a paragraph")
    if children[after - 1].nodeType == TEXT:
        # we may need to split off trailing white space:
        child = children[after - 1]
        data = child.data
        if data.rstrip() != data:
            have_last = 0
            child.splitText(len(data.rstrip()))
    para = doc.createElement(PARA_ELEMENT)
    prev = None
    # Move the nodes last-to-first, inserting each before the one moved
    # previously, so they keep their original order inside the paragraph.
    indexes = range(start, after)
    indexes.reverse()
    for j in indexes:
        node = parent.childNodes[j]
        parent.removeChild(node)
        para.insertBefore(node, prev)
        prev = node
    if have_last:
        parent.appendChild(para)
        parent.appendChild(doc.createTextNode("\n\n"))
        return len(parent.childNodes)
    else:
        # Put the paragraph back where the run started, making sure a
        # newline separates it from whatever follows.
        nextnode = parent.childNodes[start]
        if nextnode.nodeType == TEXT:
            if nextnode.data and nextnode.data[0] != "\n":
                nextnode.data = "\n" + nextnode.data
        else:
            newnode = doc.createTextNode("\n")
            parent.insertBefore(newnode, nextnode)
            nextnode = newnode
            start = start + 1
        parent.insertBefore(para, nextnode)
        return start + 1
726
727
def skip_leading_nodes(children, start=0):
    """Return index into children of a node at which paragraph building should
    begin or a recursive call to fixup_paras_helper() should be made (for
    subsections, etc.).

    When the return value >= len(children), we've built all the paras we can
    from this list of children.
    """
    limit = len(children)
    while start < limit:
        # skip over leading comments and whitespace:
        child = children[start]
        kind = child.nodeType
        if kind == TEXT:
            text = child.data
            stripped = text.lstrip()
            if stripped:
                if text == stripped:
                    return start
                # break into two nodes: whitespace and non-whitespace
                child.splitText(len(text) - len(stripped))
                return start + 1
            # all whitespace, just skip
        elif kind == ELEMENT:
            gi = child.tagName
            if gi in RECURSE_INTO_PARA_CONTAINERS:
                return start
            if gi not in PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS:
                return start
        start += 1
    return start
759
760
def fixup_rfc_references(doc, fragment):
    """Append display text such as "RFC 822" to every <pep> and <rfc> element.

    The text is the upper-cased tag name, a space, and the "num" attribute.
    """
    for ref in find_all_elements_from_set(fragment, ("pep", "rfc")):
        label = ref.tagName.upper() + " " + ref.getAttribute("num")
        ref.appendChild(doc.createTextNode(label))
765
766
def fixup_signatures(doc, fragment):
    """Rewrite every <args> and <constructor-args> element below fragment.

    For each element child of fragment, all of its <args> descendants are
    rewritten first, then all of its <constructor-args> descendants.
    """
    for child in fragment.childNodes:
        if child.nodeType != ELEMENT:
            continue
        for gi in ("args", "constructor-args"):
            for arglist in child.getElementsByTagName(gi):
                rewrite_args(doc, arglist)
776
def rewrite_args(doc, arglist):
    """Flatten <optional> markup in arglist and tidy the resulting text.

    When arglist ends up with a single text child, runs of whitespace in it
    are collapsed to single spaces.
    """
    fixup_args(doc, arglist)
    arglist.normalize()
    if arglist.childNodes.length != 1:
        return
    only = arglist.firstChild
    if only.nodeType == TEXT:
        only.data = ' '.join(only.data.split())
783
def fixup_args(doc, arglist):
    """Replace each <optional> child of arglist by its contents in brackets.

    The children of an <optional> are moved into arglist between "[" and
    "]" text nodes.  Nested <optional> elements surface as direct children
    and are handled on a later pass.  Always returns None.
    """
    while 1:
        optional = None
        for child in arglist.childNodes:
            if child.nodeName == "optional":
                optional = child
                break
        if optional is None:
            return None
        # found one; unwrap it and rescan from the start
        arglist.insertBefore(doc.createTextNode("["), optional)
        while optional.childNodes:
            arglist.insertBefore(optional.firstChild, optional)
        arglist.insertBefore(doc.createTextNode("]"), optional)
        arglist.removeChild(optional)
795
796
def fixup_sectionauthors(doc, fragment):
    """Convert each <sectionauthor name="..."/> to an <author> element.

    The element is renamed, its "name" attribute becomes its text content,
    and it is re-inserted near the start of its parent: before the third
    child when the second child is a <title>, otherwise before the first
    child.
    """
    for author in find_all_elements(fragment, "sectionauthor"):
        section = author.parentNode
        section.removeChild(author)
        set_tagName(author, "author")
        author_name = author.getAttribute("name")
        author.appendChild(doc.createTextNode(author_name))
        author.removeAttribute("name")
        siblings = section.childNodes
        anchor = siblings[2]
        if siblings[1].nodeName != "title":
            anchor = siblings[0]
        section.insertBefore(doc.createTextNode("\n "), anchor)
        section.insertBefore(author, anchor)
811
812
def fixup_verbatims(doc):
    """Rename <verbatim> elements that hold an interpreter transcript.

    A <verbatim> whose first child is text starting (after leading
    whitespace) with ">>>" becomes an <interactive-session>.  Empty
    <verbatim> elements are left unchanged.
    """
    for verbatim in find_all_elements(doc, "verbatim"):
        # firstChild is None for an empty element, where indexing the child
        # list would raise IndexError.
        child = verbatim.firstChild
        if child is not None and child.nodeType == TEXT \
           and child.data.lstrip().startswith(">>>"):
            set_tagName(verbatim, "interactive-session")
819
820
def add_node_ids(fragment, counter=0):
    """Assign an integer node_id attribute to fragment and all nodes below it.

    fragment itself gets counter; its descendants get increasing numbers in
    document order.  Returns a number greater than any id assigned.
    """
    fragment.node_id = counter
    for node in fragment.childNodes:
        counter += 1
        if node.nodeType != ELEMENT:
            node.node_id = counter
        else:
            counter = add_node_ids(node, counter)
    return counter + 1
830
831
def fixup_ulink(doc, fragment):
    """Collapse each two-child <ulink> into an element with an href attribute.

    The first child wraps the link text and the second wraps the URL.  The
    URL text becomes the "href" attribute and the link text's children
    become the children of the <ulink> itself.  The expected structure is
    checked with assert statements.
    """
    for ulink in find_all_elements(fragment, "ulink"):
        parts = ulink.childNodes
        assert len(parts) == 2
        text = parts[0]
        href = parts[1]
        href.normalize()
        assert len(href.childNodes) == 1
        url_node = href.childNodes[0]
        assert url_node.nodeType == TEXT
        ulink.setAttribute("href", url_node.data)
        ulink.removeChild(href)
        # appendChild() re-parents the node, so this live list drains.
        content = text.childNodes
        while len(content) > 0:
            ulink.appendChild(content[0])
        ulink.removeChild(text)
848
849
# Tag names of the module index-reference elements that
# fixup_refmodindexes() folds into neighbouring <module> elements.
REFMODINDEX_ELEMENTS = ('refmodindex', 'refbimodindex',
                        'refexmodindex', 'refstmodindex')
852
def fixup_refmodindexes(fragment):
    """Fold <ref*modindex> elements into their co-located <module> elements.

    Requires add_node_ids() to have been run, since parents are
    de-duplicated by their node_id attribute.
    """
    # Locate <ref*modindex>...</> co-located with <module>...</>, and
    # remove the <ref*modindex>, replacing it with index=index on the
    # <module> element.
    parents = {}
    for node in find_all_elements_from_set(fragment, REFMODINDEX_ELEMENTS):
        owner = node.parentNode
        parents[owner.node_id] = owner
    for owner in parents.values():
        fixup_refmodindexes_chunk(owner)
864
865
def fixup_refmodindexes_chunk(container):
    """Match the <ref*modindex> entries in container against its <module>s.

    Each childless index entry whose "module" attribute equals the text of
    a single-child <module> element marks that element with index="yes"
    and is then removed.  Entries with children are reported on stderr and
    left in place.
    """
    # node is probably a <para>; let's see how often it isn't:
    if container.tagName != PARA_ELEMENT:
        bwrite("--- fixup_refmodindexes_chunk(%s)\n" % container)
    module_entries = find_all_elements(container, "module")
    if not module_entries:
        return
    index_entries = find_all_elements_from_set(container, REFMODINDEX_ELEMENTS)
    removes = []
    for entry in index_entries:
        children = entry.childNodes
        if len(children) != 0:
            bwrite("--- unexpected number of children for %s node:\n"
                   % entry.tagName)
            ewrite(entry.toxml() + "\n")
            continue
        found = 0
        module_name = entry.getAttribute("module")
        for node in module_entries:
            if len(node.childNodes) != 1:
                continue
            this_name = node.childNodes[0].data
            if this_name == module_name:
                found = 1
                node.setAttribute("index", "yes")
        if found:
            removes.append(entry)
    for node in removes:
        # The entries were collected recursively and may be nested below
        # container, so each one is removed from its own parent.
        node.parentNode.removeChild(node)
895
896
def fixup_bifuncindexes(fragment):
    """Process the parent of every <bifuncindex> element in fragment once.

    Requires add_node_ids() to have been run, since parents are
    de-duplicated by their node_id attribute.
    """
    # make sure that each parent is only processed once:
    parents = {}
    for node in find_all_elements(fragment, 'bifuncindex'):
        owner = node.parentNode
        parents[owner.node_id] = owner
    for owner in parents.values():
        fixup_bifuncindexes_chunk(owner)
906
907
def fixup_bifuncindexes_chunk(container):
    """Fold <bifuncindex> children of container into matching <function>s.

    A <function> child matches when its first child's text is the entry's
    "name" attribute followed by "()".  Every matching <function> gets
    index="yes" and module="__builtin__"; an entry with at least one match
    is removed from container.
    """
    index_entries = find_all_child_elements(container, "bifuncindex")
    functions = find_all_child_elements(container, "function")
    removes = []
    for entry in index_entries:
        wanted = entry.getAttribute("name")
        matched = 0
        for func in functions:
            label = func.childNodes[0].data
            if label[-2:] == "()" and label[:-2] == wanted:
                func.setAttribute("index", "yes")
                func.setAttribute("module", "__builtin__")
                if not matched:
                    matched = 1
                    removes.append(entry)
    for entry in removes:
        container.removeChild(entry)
928
929
def join_adjacent_elements(container, gi):
    """Merge directly adjacent sibling elements named gi throughout container.

    When two elements named gi are immediate siblings, the children of the
    second are appended to the first and the second is removed; runs of
    three or more collapse into the first.  Every element below container is
    visited, and a message is written to stderr for each merge.
    """
    queue = [container]
    while queue:
        parent = queue.pop()
        children = parent.childNodes
        i = 0
        # children is a live list that shrinks on every merge, so its length
        # is re-read each time instead of being cached; the loop also covers
        # the last child so that it is descended into as well.
        while i < len(children):
            child = children[i]
            if child.nodeName == gi and (i + 1) < len(children) \
               and children[i+1].nodeName == gi:
                ewrite("--- merging two <%s/> elements\n" % gi)
                nextchild = children[i+1]
                nextchildren = nextchild.childNodes
                while len(nextchildren):
                    node = nextchildren[0]
                    nextchild.removeChild(node)
                    child.appendChild(node)
                parent.removeChild(nextchild)
                # Stay on the same child: its new neighbour may match too.
                continue
            if child.nodeType == ELEMENT:
                queue.append(child)
            i = i + 1
954
955
# Attribute values matching this pattern are written as TOKEN, all others
# as CDATA.
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")

def write_esis(doc, ofp, knownempty):
    """Write the children of doc to ofp as ESIS event lines.

    knownempty is called with a tag name and returns true for elements
    declared empty.  Raises ValueError when such an element has children,
    and RuntimeError for node types other than element, text and entity
    reference.
    """
    for node in doc.childNodes:
        kind = node.nodeType
        if kind == TEXT:
            ofp.write("-%s\n" % esistools.encode(node.data))
        elif kind == ENTITY_REFERENCE:
            ofp.write("&%s\n" % node.nodeName)
        elif kind == ELEMENT:
            gi = node.tagName
            if knownempty(gi):
                if node.hasChildNodes():
                    raise ValueError(
                        "declared-empty node <%s> has children" % gi)
                ofp.write("e\n")
            for k, value in node.attributes.items():
                dtype = "CDATA"
                if _token_rx.match(value):
                    dtype = "TOKEN"
                ofp.write("A%s %s %s\n" % (k, dtype, esistools.encode(value)))
            ofp.write("(%s\n" % gi)
            write_esis(node, ofp, knownempty)
            ofp.write(")%s\n" % gi)
        else:
            raise RuntimeError("unsupported node type: %s" % kind)
983
984
def convert(ifp, ofp):
    """Read ESIS data from ifp, transform the tree, and write ESIS to ofp.

    The input is parsed with esistools into a document fragment, the fixup
    passes below are applied in order, and the result is written with
    write_esis().  An IOError with errno EPIPE while writing is ignored;
    any other error propagates.
    """
    events = esistools.parse(ifp)
    toktype, doc = events.getEvent()
    fragment = doc.createDocumentFragment()
    events.expandNode(fragment)

    normalize(fragment)
    simplify(doc, fragment)
    handle_labels(doc, fragment)
    handle_appendix(doc, fragment)
    fixup_trailing_whitespace(doc, fragment, {
        # element -> (before-end-tag, after-end-tag)
        "abstract": ("\n", "\n"),
        "title": ("", "\n"),
        "chapter": ("\n", "\n\n\n"),
        "section": ("\n", "\n\n\n"),
        "subsection": ("\n", "\n\n"),
        "subsubsection": ("\n", "\n\n"),
        "paragraph": ("\n", "\n\n"),
        "subparagraph": ("\n", "\n\n"),
        "description": ("\n", "\n\n"),
        "enumeration": ("\n", "\n\n"),
        "item": ("\n", "\n\n"),
        })
    cleanup_root_text(doc)
    cleanup_trailing_parens(fragment, ["function", "method", "cfunction"])
    cleanup_synopses(doc, fragment)
    fixup_descriptors(doc, fragment)
    fixup_verbatims(fragment)
    normalize(fragment)
    fixup_paras(doc, fragment)
    fixup_sectionauthors(doc, fragment)
    fixup_table_structures(doc, fragment)
    fixup_rfc_references(doc, fragment)
    fixup_signatures(doc, fragment)
    fixup_ulink(doc, fragment)
    add_node_ids(fragment)
    fixup_refmodindexes(fragment)
    fixup_bifuncindexes(fragment)
    # Take care of ugly hacks in the LaTeX markup to avoid LaTeX and
    # LaTeX2HTML screwing with GNU-style long options (the '--' problem).
    join_adjacent_elements(fragment, "option")
    # Attempt to avoid trailing blank lines:
    fragment.normalize()
    # Only a text node has .data; the fragment may end in an element or be
    # empty.
    last = fragment.lastChild
    if last is not None and last.nodeType == TEXT \
       and last.data[-1:] == "\n":
        last.data = last.data.rstrip() + "\n"
    #
    # Names of the elements the parser reports as empty, except for these
    # three, which are given content by the passes above.
    d = {}
    for gi in events.parser.get_empties():
        d[gi] = gi
    for key in ("author", "pep", "rfc"):
        if key in d:
            del d[key]
    knownempty = d.__contains__
    #
    try:
        write_esis(fragment, ofp, knownempty)
    except IOError:
        # Ignore EPIPE; it just means that whoever we're writing to stopped
        # reading. The rest of the output would be ignored. All other errors
        # should still be reported.  The errno attribute is read instead of
        # unpacking the exception, which fails unless it has exactly two
        # arguments.
        err = sys.exc_info()[1]
        if getattr(err, "errno", None) != errno.EPIPE:
            raise
1048
1049
def main():
    """Command-line entry point: docfixer.py [infile [outfile]].

    With no arguments, reads stdin and writes stdout.  With one argument,
    reads the named file.  With two, the output is buffered in memory and
    written to the second file only after the conversion has succeeded.
    Exits with status 2 when given more than two arguments.
    """
    argc = len(sys.argv)
    if argc > 3:
        # There is no usage() function in this module; report directly.
        sys.stderr.write("usage: %s [infile [outfile]]\n" % sys.argv[0])
        sys.exit(2)
    if argc == 1:
        ifp = sys.stdin
    else:
        ifp = open(sys.argv[1])
    try:
        if argc == 3:
            import StringIO
            ofp = StringIO.StringIO()
        else:
            ofp = sys.stdout
        convert(ifp, ofp)
        if argc == 3:
            fp = open(sys.argv[2], "w")
            try:
                fp.write(ofp.getvalue())
            finally:
                fp.close()
            ofp.close()
    finally:
        # Close the input file even when the conversion fails.
        if ifp is not sys.stdin:
            ifp.close()
1070
1071
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/essentials/dev-lang/python/Doc/tools/sgmlconv/docfixer.py

Download in other formats: