Context Navigation

gencodec.py

Visit:

Last change on this file was 3225, checked in by bird, 19 years ago
Python 2.5
File size: 11.8 KB

Line
1	""" Unicode Mapping Parser and Codec Generator.
2
3	This script parses Unicode mapping files as available from the Unicode
4	site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
5	modules from them. The codecs use the standard character mapping codec
6	to actually apply the mapping.
7
8	Synopsis: gencodec.py dir codec_prefix
9
10	All files in dir are scanned and those producing non-empty mappings
11	will be written to <codec_prefix><mapname>.py with <mapname> being the
12	first part of the map's filename ('a' in a.b.c.txt) converted to
13	lowercase with hyphens replaced by underscores.
14
15	The tool also writes marshalled versions of the mapping tables to the
16	same location (with .mapping extension).
17
18	Written by Marc-Andre Lemburg (mal@lemburg.com).
19
20	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
21	(c) Copyright Guido van Rossum, 2000.
22
23	Table generation:
24	(c) Copyright Marc-Andre Lemburg, 2005.
25	Licensed to PSF under a Contributor Agreement.
26
27	"""#"
28
29	import re, os, time, marshal, codecs
30
# Maximum allowed size of charmap tables
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point (U+FFFE).  Spelled as a literal
# rather than unichr(0xFFFE) so the value does not depend on a builtin
# that only exists in Python 2.
UNI_UNDEFINED = u'\uFFFE'

# Matches one line of a mapping file:
#   group 1: one or more '+'-joined hex codes (the encoded side)
#   group 2: zero or more '+'-joined hex codes or <META> codes
#   group 3: an optional trailing '#' comment
# Raw strings keep the regex escapes (\s, \+) out of Python's own
# string-escape processing, and the alternation between a hex code and
# a <META> code is a real '|' rather than an escaped literal pipe.
# NOTE(review): 'A-Z' in the second hex class looks like a typo for
# 'A-F'; it is left alone here so the set of captured tokens does not
# change.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
42
def parsecodes(codes,
               len=len, filter=filter, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of '+'-separated hexadecimal codes such as
        '0x41' or '0x41+0x0301'.

        meta-codes (in angular brackets, e.g. <LR> and <RL>) and any
        other part that is not a valid hexadecimal number are ignored.

        Returns None if codes is empty or if no valid code remains,
        an integer if exactly one valid code remains, and a tuple of
        integers otherwise.

        The len, filter and range parameters are kept only so that
        existing callers remain valid; filter and range are not used.

    """
    if not codes:
        return None
    values = []
    for part in codes.split('+'):
        try:
            values.append(int(part, 16))
        except ValueError:
            # Not a hex number (meta-code, empty part, ...): drop it.
            pass
    if not values:
        # Nothing usable was found, so report the code as undefined
        # instead of raising or returning an empty tuple.
        return None
    if len(values) == 1:
        return values[0]
    return tuple(values)
70
def readmap(filename):

    """ Reads the mapping file filename and returns its mapping as
        dictionary.

        Keys are the encoded codes as returned by parsecodes() (an
        integer, or a tuple of integers for code combinations).  Values
        are (unicode_code, comment) tuples, where unicode_code is again
        a parsecodes() result and comment is the text following '#' on
        the line ('' if there is none).

        Code points 0x00 - 0x1F and 0x7F are pre-filled as identity
        mappings with the comment 'CONTROL CHARACTER'; entries in the
        file override them.

        If at least as many single-byte codes are identity-mapped as
        are left unmapped, the unmapped ones are added with a value of
        (None, "") and the extra key 'IDENTITY' is set to 256.

        Blank lines, comment lines and lines not matching mapRE are
        skipped.

    """
    f = open(filename, 'r')
    try:
        lines = f.readlines()
    finally:
        # Release the file handle even if reading fails.
        f.close()
    enc2uni = {}
    identity = []
    # Real list: entries are removed from it as codes get mapped.
    unmapped = list(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity.append(i)
        unmapped.remove(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            # Not a mapping entry: skipped silently.
            continue
        enc, uni, comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if comment is None:
            comment = ''
        else:
            # Drop the leading '#'.
            comment = comment[1:].strip()
        # Only single codes below 256 take part in the identity and
        # unmapped bookkeeping; code combinations (tuples) never do.
        if not isinstance(enc, tuple) and enc < 256:
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append(enc)
        enc2uni[enc] = (uni, comment)

    # If there are at least as many identity-mapped entries as unmapped
    # entries, it pays to generate an identity dictionary first, and add
    # explicit mappings to None for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni
121
def hexrepr(t, precision=4):

    """ Returns t formatted as Python source text in hexadecimal.

        None gives 'None'.  An object without a length is formatted as
        a single number, '0x' followed by at least precision upper-case
        hex digits.  Anything with a length is formatted item by item
        as '(0x..., 0x...)'.

        If an item cannot be formatted as a number, a diagnostic line
        is written to standard output and the TypeError is re-raised.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length: t is a single code, not a sequence of codes.
        return '0x%0*X' % (precision, t)
    try:
        return '(' + ', '.join(['0x%0*X' % (precision, item)
                                for item in t]) + ')'
    except TypeError:
        import sys
        # sys.exc_info() and an explicit write are used instead of
        # "except X, e" and a print statement so that this block is
        # valid syntax on both Python 2 and Python 3.
        why = sys.exc_info()[1]
        sys.stdout.write('* failed to convert %r: %s\n' % (t, why))
        raise
136
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns the source lines of a dictionary definition for map.

        varname is the name the generated code assigns to.  map maps
        codes to either a plain value or a (value, comment) tuple; a
        key may also be a (key, comment) tuple.  Entries whose key is
        None are skipped.  If comments is true, a non-empty comment is
        appended to its entry as a trailing '#' comment.  precisions is
        a (key_precision, value_precision) pair passed on to hexrepr().

        If map has an "IDENTITY" entry, the generated code starts from
        codecs.make_identity_dict(range(<that entry's value>)) and
        entries below 256 that map to themselves are left out.  The
        "IDENTITY" entry is deleted from map as a side effect.

        The definition is split into several .update({...}) calls of
        at most 4096 entries each.

        The result is a list of strings, one per generated line.

    """
    lines = []
    emit = lines.append
    identity = "IDENTITY" in map
    if identity:
        emit("%s = codecs.make_identity_dict(range(%d))" %
             (varname, map["IDENTITY"]))
        emit("%s.update({" % varname)
        # The marker is removed from the caller's dictionary; this side
        # effect is kept on purpose since later code may rely on it.
        del map["IDENTITY"]
    else:
        emit("%s = {" % varname)
    # True while the literal being written is the argument of an
    # .update() call, which is closed with '})' instead of '}'.
    in_update = identity

    key_precision, value_precision = precisions
    count = 0
    for mapkey, mapvalue in sorted(map.items()):
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        if (identity and
            mapkey == mapvalue and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            emit(' %s: %s,\t# %s' % (key, value, mapcomment))
        else:
            emit(' %s: %s,' % (key, value))
        count += 1
        if count == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if in_update:
                emit('})')
            else:
                emit('}')
            emit('%s.update({' % varname)
            count = 0
            in_update = True
    if in_update:
        emit('})')
    else:
        emit('}')

    return lines
194
195	def python_tabledef_code(varname, map, comments=1, key_precision=2):
196
197	l = []
198	append = l.append
199	append('%s = (' % varname)
200
201	# Analyze map and create table dict
202	mappings = map.items()
203	mappings.sort()
204	table = {}
205	maxkey = 0
206	if map.has_key('IDENTITY'):
207	for key in range(256):
208	table[key] = (key, '')
209	maxkey = 255
210	del map['IDENTITY']
211	for mapkey, mapvalue in mappings:
212	mapcomment = ''
213	if isinstance(mapkey, tuple):
214	(mapkey, mapcomment) = mapkey
215	if isinstance(mapvalue, tuple):
216	(mapvalue, mapcomment) = mapvalue
217	if mapkey is None:
218	continue
219	table[mapkey] = (mapvalue, mapcomment)
220	if mapkey > maxkey: