source: trunk/essentials/dev-lang/python/Doc/tools/sgmlconv/latex2esis.py@ 3226

Last change on this file since 3226 was 3225, checked in by bird, 19 years ago

Python 2.5

File size: 19.5 KB
Line 
1#! /usr/bin/env python
2
3"""Generate ESIS events based on a LaTeX source document and
4configuration data.
5
6The conversion is not strong enough to work with arbitrary LaTeX
7documents; it has only been designed to work with the highly stylized
8markup used in the standard Python documentation. A lot of
9information about specific markup is encoded in the control table
10passed to the convert() function; changing this table can allow this
11tool to support additional LaTeX markups.
12
13The format of the table is largely undocumented; see the commented
14headers where the table is specified in main(). There is no provision
15to load an alternate table from an external file.
16"""
17
18import errno
19import getopt
20import os
21import re
22import sys
23import xml.sax
24import xml.sax.saxutils
25
26from esistools import encode
27
28
29DEBUG = 0
30
31
class LaTeXFormatError(Exception):
    """Base error for LaTeX input that this converter cannot process."""
34
35
class LaTeXStackError(LaTeXFormatError):
    """Error for an environment close that does not match the open stack.

    Attributes:
        found: the environment name that was being closed.
        stack: a copy of the element stack at the time of the error.
    """

    def __init__(self, found, stack):
        # Build the message from the live stack before copying it.
        description = (
            "environment close for %s doesn't match;\n stack = %s"
            % (found, stack))
        snapshot = stack[:]
        self.found = found
        self.stack = snapshot
        LaTeXFormatError.__init__(self, description)
43
44
45
# All of these patterns are applied with .match(), i.e. anchored at the
# start of the text that is still to be consumed.

# "\begin{name}" / "\end{name}"; group 1 is the environment name.
_begin_env_rx = re.compile(r"[\\]begin{([^}]*)}")
_end_env_rx = re.compile(r"[\\]end{([^}]*)}")
# A backslash followed by a macro name made of letters with an optional
# trailing "*" (group 1), an optional single space, and then either an
# opening brace or a (possibly empty) run of whitespace (group 2).
_begin_macro_rx = re.compile(r"[\\]([a-zA-Z]+[*]?) ?({|\s*\n?)")
# One or more "%", an optional space, the rest of the line (group 1), the
# newline, and any spaces/tabs that start the following line.
_comment_rx = re.compile("%+ ?(.*)\n[ \t]*")
# A run of characters none of which is "]", "~", "%", "\", "{" or "}".
_text_rx = re.compile(r"[^]~%\\{}]+")
# Optional leading whitespace and a "[...]" argument; group 1 is the text
# between the brackets.
_optional_rx = re.compile(r"\s*[[]([^]]*)[]]", re.MULTILINE)
# _parameter_rx is this complicated to allow {...} inside a parameter;
# this is useful to match tabular layout specifications like {c|p{24pt}}
# (only one level of nested braces is accepted; group 1 is the text
# between the outer braces).
_parameter_rx = re.compile("[ \n]*{(([^{}}]|{[^}]*})*)}")
# A letter followed by letters, digits, "." or "-" up to the end of the
# string; dump_attr() uses it to choose between TOKEN and CDATA.
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
# Spaces/newlines followed by "{" or "[" respectively.
# NOTE(review): neither pattern is referenced in the part of the file
# visible here -- confirm they are still needed.
_start_group_rx = re.compile("[ \n]*{")
_start_optional_rx = re.compile("[ \n]*[[]")


# Characters that, when preceded by a backslash in the input, are written
# out as literal text by Conversion.subconvert().
ESCAPED_CHARS = "$%#^ {}&~"
61
62
def dbgmsg(msg):
    """Write msg plus a newline to stderr, but only when DEBUG is set."""
    if not DEBUG:
        return
    sys.stderr.write(msg + "\n")
66
def pushing(name, point, depth):
    """Emit a debug message for opening element name at point.

    depth is accepted but not used.
    """
    note = "pushing <%s> at %s" % (name, point)
    dbgmsg(note)
69
def popping(name, point, depth):
    """Emit a debug message for closing element name at point.

    depth is accepted but not used.
    """
    note = "popping </%s> at %s" % (name, point)
    dbgmsg(note)
72
73
74class _Stack(list):
75 def append(self, entry):
76 if not isinstance(entry, str):
77 raise LaTeXFormatError("cannot push non-string on stack: %r"
78 % (entry, ))
79 #dbgmsg("%s<%s>" % (" "*len(self.data), entry))
80 list.append(self, entry)
81
82 def pop(self, index=-1):
83 entry = self[index]
84 del self[index]
85 #dbgmsg("%s</%s>" % (" " * len(self), entry))
86
87 def __delitem__(self, index):
88 entry = self[index]
89 list.__delitem__(self, index)
90 #dbgmsg("%s</%s>" % (" " * len(self), entry))
91
92
def new_stack():
    """Return an empty element stack.

    A checking _Stack is used when DEBUG is set, a plain list otherwise.
    """
    return _Stack() if DEBUG else []
98
99
100
class Conversion:
    """Translate LaTeX source text into ESIS events written to a stream.

    The complete input is read in the constructor; self.line always holds
    the text that has not been consumed yet.  self.table maps macro and
    environment names to entry objects (see TableEntry) that describe how
    each one is converted; names missing from the table get a default
    entry the first time they are seen.

    Output records written by this class:
        "(name"             element start
        ")name"             element end
        "-text"             character data
        "Aname TYPE value"  attribute for the next element start
        "&name"             entity reference
        "e"                 written before the start of an empty element
    """

    def __init__(self, ifp, ofp, table):
        """Read all of ifp and remember ofp and table.

        Trailing whitespace is stripped from every input line and the
        lines are re-joined with "\\n" (including one after the last line).
        """
        self.write = ofp.write
        self.ofp = ofp
        self.table = table
        L = [s.rstrip() for s in ifp.readlines()]
        L.append("")
        self.line = "\n".join(L)
        self.preamble = 1

    def convert(self):
        """Convert the whole input."""
        self.subconvert()

    def subconvert(self, endchar=None, depth=0):
        """Convert input starting at self.line.

        Parses content, including sub-structures, until the character
        'endchar' is found (with no open structures), or until the end
        of the input data if endchar is None.

        depth is only passed along to nested calls.

        Returns the remaining text, beginning with endchar (also stored
        in self.line), when endchar was found; returns None when the
        input ran out.

        Raises LaTeXFormatError (or its subclass LaTeXStackError) for
        markup that cannot be handled.
        """
        stack = new_stack()
        line = self.line
        while line:
            if line[0] == endchar and not stack:
                self.line = line
                return line
            m = _comment_rx.match(line)
            if m:
                text = m.group(1)
                if text:
                    self.write("(COMMENT\n- %s \n)COMMENT\n-\\n\n"
                               % encode(text))
                line = line[m.end():]
                continue
            m = _begin_env_rx.match(line)
            if m:
                name = m.group(1)
                # Called for its effects only: registers a default entry
                # for unknown environments and rejects names that the
                # table defines as macros.
                self.get_env_entry(name)
                # re-write to use the macro handler
                line = r"\%s %s" % (name, line[m.end():])
                continue
            m = _end_env_rx.match(line)
            if m:
                # end of environment
                envname = m.group(1)
                entry = self.get_entry(envname)
                # implicitly close elements this environment's end closes
                while stack and envname != stack[-1] \
                      and stack[-1] in entry.endcloses:
                    self.write(")%s\n" % stack.pop())
                if stack and envname == stack[-1]:
                    self.write(")%s\n" % entry.outputname)
                    del stack[-1]
                else:
                    raise LaTeXStackError(envname, stack)
                line = line[m.end():]
                continue
            m = _begin_macro_rx.match(line)
            if m:
                # start of macro
                macroname = m.group(1)
                if macroname == "c":
                    # Ugh!  This is a combining character...
                    endpos = m.end()
                    self.combining_char("c", line[endpos])
                    line = line[endpos + 1:]
                    continue
                entry = self.get_entry(macroname)
                if entry.verbatim:
                    # magic case!  Everything up to the matching \end{...}
                    # is passed through as character data.
                    end_marker = "\\end{%s}" % macroname
                    pos = line.find(end_marker)
                    if pos < 0:
                        # find() returns -1 when the marker is missing;
                        # slicing with that would silently mangle input.
                        raise LaTeXFormatError(
                            "missing %s for verbatim environment"
                            % end_marker)
                    text = line[m.end(1):pos]
                    stack.append(entry.name)
                    self.write("(%s\n" % entry.outputname)
                    self.write("-%s\n" % encode(text))
                    self.write(")%s\n" % entry.outputname)
                    stack.pop()
                    line = line[pos + len(end_marker):]
                    continue
                # implicitly close open elements that this macro closes
                while stack and stack[-1] in entry.closes:
                    top = stack.pop()
                    topentry = self.get_entry(top)
                    if topentry.outputname:
                        self.write(")%s\n-\\n\n" % topentry.outputname)
                #
                if entry.outputname and entry.empty:
                    self.write("e\n")
                #
                params, optional, empty = self.start_macro(macroname)
                # rip off the macroname; when there are parameters to
                # parse (or the macro is empty) keep whatever followed
                # the name, otherwise drop the whole match
                if params or empty:
                    line = line[m.end(1):]
                else:
                    line = line[m.end():]
                opened = 0
                implied_content = 0

                # handle attribute mappings here:
                for pentry in params:
                    if pentry.type == "attribute":
                        if pentry.optional:
                            m = _optional_rx.match(line)
                            if m and entry.outputname:
                                line = line[m.end():]
                                self.dump_attr(pentry, m.group(1))
                        elif pentry.text and entry.outputname:
                            # value supplied by conversion spec:
                            self.dump_attr(pentry, pentry.text)
                        else:
                            m = _parameter_rx.match(line)
                            if not m:
                                raise LaTeXFormatError(
                                    "could not extract parameter %s for %s: %r"
                                    % (pentry.name, macroname, line[:100]))
                            if entry.outputname:
                                self.dump_attr(pentry, m.group(1))
                            line = line[m.end():]
                    elif pentry.type == "child":
                        if pentry.optional:
                            m = _optional_rx.match(line)
                            if m:
                                line = line[m.end():]
                                if entry.outputname and not opened:
                                    opened = 1
                                    self.write("(%s\n" % entry.outputname)
                                    stack.append(macroname)
                                stack.append(pentry.name)
                                self.write("(%s\n" % pentry.name)
                                self.write("-%s\n" % encode(m.group(1)))
                                self.write(")%s\n" % pentry.name)
                                stack.pop()
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            self.write("(%s\n" % pentry.name)
                            stack.append(pentry.name)
                            # drop the opening brace and convert the
                            # group's content recursively
                            self.line = skip_white(line)[1:]
                            rest = self.subconvert(
                                "}", len(stack) + depth + 1)
                            if rest is None:
                                # input ended before the closing brace
                                raise LaTeXFormatError(
                                    "unterminated %s parameter for %s"
                                    % (pentry.name, macroname))
                            # rest starts with the closing brace
                            line = rest[1:]
                            self.write(")%s\n" % stack.pop())
                    elif pentry.type == "content":
                        if pentry.implied:
                            implied_content = 1
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            line = skip_white(line)
                            if line[0] != "{":
                                raise LaTeXFormatError(
                                    "missing content for " + macroname)
                            self.line = line[1:]
                            line = self.subconvert("}", len(stack) + depth + 1)
                            if line and line[0] == "}":
                                line = line[1:]
                    elif pentry.type == "text" and pentry.text:
                        if entry.outputname and not opened:
                            opened = 1
                            stack.append(entry.name)
                            self.write("(%s\n" % entry.outputname)
                        #dbgmsg("--- text: %r" % pentry.text)
                        self.write("-%s\n" % encode(pentry.text))
                    elif pentry.type == "entityref":
                        self.write("&%s\n" % pentry.name)
                if entry.outputname:
                    if not opened:
                        self.write("(%s\n" % entry.outputname)
                        stack.append(entry.name)
                    if not implied_content:
                        # elements with implied content stay open until
                        # something closes them later
                        self.write(")%s\n" % entry.outputname)
                        stack.pop()
                continue
            if line[0] == "}":
                # end of macro or group
                if not stack:
                    raise LaTeXFormatError(
                        "unmatched closing brace: %r" % (line[:100], ))
                macroname = stack[-1]
                if macroname:
                    conversion = self.table[macroname]
                    if conversion.outputname:
                        # otherwise, it was just a bare group
                        self.write(")%s\n" % conversion.outputname)
                del stack[-1]
                line = line[1:]
                continue
            if line[0] == "~":
                # don't worry about the "tie" aspect of this command
                line = line[1:]
                self.write("- \n")
                continue
            if line[0] == "{":
                # bare group; the empty name marks it on the stack
                stack.append("")
                line = line[1:]
                continue
            if line[0] == "\\" and line[1] in ESCAPED_CHARS:
                self.write("-%s\n" % encode(line[1]))
                line = line[2:]
                continue
            if line[:2] == r"\\":
                self.write("(BREAK\n)BREAK\n")
                line = line[2:]
                continue
            if line[:2] == r"\_":
                line = "_" + line[2:]
                continue
            if line[:2] in (r"\'", r'\"'):
                # combining characters...
                self.combining_char(line[1], line[2])
                line = line[3:]
                continue
            m = _text_rx.match(line)
            if m:
                text = encode(m.group())
                self.write("-%s\n" % text)
                line = line[m.end():]
                continue
            # special case because of \item[]
            # XXX can we axe this???
            if line[0] == "]":
                self.write("-]\n")
                line = line[1:]
                continue
            # avoid infinite loops
            extra = ""
            if len(line) > 100:
                extra = "..."
            raise LaTeXFormatError("could not identify markup: %r%s"
                                   % (line[:100], extra))
        # Out of input: close whatever is still open, as long as the
        # top-of-stack entry has a non-empty 'closes' list.
        while stack:
            entry = self.get_entry(stack[-1])
            if entry.closes:
                self.write(")%s\n-%s\n" % (entry.outputname, encode("\n")))
                del stack[-1]
            else:
                break
        if stack:
            raise LaTeXFormatError("elements remain on stack: "
                                   + ", ".join(stack))
        # otherwise we just ran out of input here...

    # This is a really limited table of combinations, but it will have
    # to do for now.
    _combinations = {
        ("c", "c"): 0x00E7,
        ("'", "e"): 0x00E9,
        ('"', "o"): 0x00F6,
        }

    def combining_char(self, prefix, char):
        """Write the character reference for accent prefix applied to char.

        Raises KeyError if the pair is not listed in _combinations.
        """
        ordinal = self._combinations[(prefix, char)]
        self.write("-\\%%%d;\n" % ordinal)

    def start_macro(self, name):
        """Return (parameters, optional, empty) for the entry called name.

        optional is the 'optional' flag of the first parameter, or the
        (empty) parameter list itself when there are no parameters.
        """
        conversion = self.get_entry(name)
        parameters = conversion.parameters
        optional = parameters and parameters[0].optional
        return parameters, optional, conversion.empty

    def get_entry(self, name):
        """Return the table entry for name.

        An unknown name gets a default entry with a single "content"
        parameter, which is also stored in the table.
        """
        entry = self.table.get(name)
        if entry is None:
            dbgmsg("get_entry(%r) failing; building default entry!" % (name, ))
            # not defined; build a default entry:
            entry = TableEntry(name)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            self.table[name] = entry
        return entry

    def get_env_entry(self, name):
        """Return the table entry for the environment called name.

        An unknown name gets a default environment entry with one implied
        "content" parameter, which is also stored in the table.  Raises
        LaTeXFormatError if the table defines name as a non-environment.
        """
        entry = self.table.get(name)
        if entry is None:
            # not defined; build a default entry:
            entry = TableEntry(name, 1)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            entry.parameters[-1].implied = 1
            self.table[name] = entry
        elif not entry.environment:
            raise LaTeXFormatError(
                name + " is defined as a macro; expected environment")
        return entry

    def dump_attr(self, pentry, value):
        """Write an attribute record for parameter pentry with value.

        Nothing is written when pentry has no name or value is empty.
        The declared type is TOKEN when value matches _token_rx and CDATA
        otherwise.
        """
        if not (pentry.name and value):
            return
        if _token_rx.match(value):
            dtype = "TOKEN"
        else:
            dtype = "CDATA"
        self.write("A%s %s %s\n" % (pentry.name, dtype, encode(value)))
396
397
def convert(ifp, ofp, table):
    """Convert the LaTeX read from ifp to ESIS events written to ofp.

    table is the conversion table handed to Conversion.

    An IOError whose errno is EPIPE is swallowed; every other IOError
    propagates unchanged.
    """
    c = Conversion(ifp, ofp, table)
    try:
        c.convert()
    except IOError:
        # Fetch the exception via sys.exc_info() rather than unpacking it
        # in the except clause: unpacking fails with an unrelated error
        # when the IOError does not carry exactly two arguments, and this
        # spelling is valid on both old and new Python versions.
        err = sys.exc_info()[1]
        if err.errno != errno.EPIPE:
            raise
405
406
def skip_white(line):
    """Return line without its leading skippable characters.

    While the text starts with a space, "%", newline, tab or carriage
    return, that character is dropped and all whitespace following it is
    stripped as well.
    """
    skippable = " %\n\t\r"
    remaining = line
    while True:
        if not remaining or remaining[0] not in skippable:
            return remaining
        remaining = remaining[1:].lstrip()
411
412
413
class TableEntry:
    """Conversion settings for one macro or environment name.

    A new entry uses its own name as the output name, is "empty" exactly
    when it is not an environment, has every other flag cleared and owns
    fresh, empty parameters/closes/endcloses lists.
    """

    def __init__(self, name, environment=0):
        self.name = self.outputname = name
        self.environment = environment
        self.empty = not environment
        # flags, all off by default
        self.has_content = self.verbatim = self.auto_close = 0
        # per-instance lists
        self.parameters, self.closes, self.endcloses = [], [], []
426
class Parameter:
    """Description of one parameter of a macro or environment.

    type, name and optional come from the caller; text starts out as an
    empty string and implied as 0.
    """

    def __init__(self, type, name=None, optional=0):
        self.type, self.name, self.optional = type, name, optional
        self.text = ''
        self.implied = 0
434
435
436class TableHandler(xml.sax.handler.ContentHandler):
437 def __init__(self):
438 self.__table = {}
439 self.__buffer = ''
440 self.__methods = {}
441
442 def get_table(self):
443 for entry in self.__table.values():
444 if entry.environment and not entry.has_content:
445 p = Parameter("content")
446 p.implied = 1
447 entry.parameters.append(p)
448 entry.has_content = 1
449 return self.__table
450
451 def startElement(self, tag, attrs):
452 try:
453 start, end = self.__methods[tag]
454 except KeyError:
455 start = getattr(self, "start_" + tag, None)
456 end = getattr(self, "end_" + tag, None)
457 self.__methods[tag] = (start, end)
458 if start:
459 start(attrs)
460
461 def endElement(self, tag):
462 start, end = self.__methods[tag]
463 if end:
464 end()
465
466 def endDocument(self):
467 self.__methods.clear()
468
469 def characters(self, data):