source: trunk/essentials/dev-lang/python/Tools/unicode/makeunicodedata.py@3310

Last change on this file since 3310 was 3225, checked in by bird, 19 years ago

Python 2.5

File size: 30.6 KB
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl created (based on bits and pieces from unidb)
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl added character type table
# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl expand first/last ranges
# 2001-01-19 fl added character name tables (2.1)
# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd use string methods
# 2002-10-18 mvl update to Unicode 3.2
# 2002-10-22 mvl generate NFC tables
# 2002-11-24 mvl expand all ranges, sort names version-independently
# 2002-11-25 mvl add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh ([email protected])
#

import sys

SCRIPT = sys.argv[0]
VERSION = "2.5"

# The Unicode Database
UNIDATA_VERSION = "4.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
def maketables(trace=0):

    print "--- Reading", UNICODE_DATA % "", "..."

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version)

    print len(filter(None, unicode.table)), "characters"

    for version in old_versions:
        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version))
        print len(filter(None, old_unicode.table)), "characters"
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] +\
                    map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i
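
    # Layout note (a sketch, inferred from the packing above): each entry in
    # decomp_data starts with a header word whose low byte indexes
    # decomp_prefix (0 for canonical decompositions) and whose high byte is
    # the number of code points that follow, so an entry at offset i would
    # unpack roughly as
    #
    #   header = decomp_data[i]
    #   prefix = decomp_prefix[header & 255]
    #   count  = header >> 8
    #   code_points = decomp_data[i+1 : i+1+count]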

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char
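
    # A sketch of the intended lookup: for a candidate pair of characters
    # (F, L) that both occur in comp_pairs, the composed character would be
    #
    #   comp_data[comp_first[F]*total_last + comp_last[L]]
    #
    # with 0 meaning "these two do not compose".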

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, " {%d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp," { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp," {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp," { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp," {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, " \"%s\"," % name
    print >>fp, " NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, " \"%s\"," % name
    print >>fp, " NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, " \"%s\"," % name
    print >>fp, " NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, " \"%s\"," % name
    print >>fp, " NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)
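
    # A sketch of how a consumer is meant to use this two-level table (the
    # real lookup lives in Modules/unicodedata.c and its exact spelling may
    # differ): with MASK = (1 << SHIFT) - 1,
    #
    #   i = index2[(index1[ch >> SHIFT] << SHIFT) + (ch & MASK)]
    #   record = _PyUnicode_Database_Records[i]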

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title
            if record[12]:
                upper = int(record[12], 16) - char
                assert -32768 <= upper <= 32767
                upper = upper & 0xffff
            else:
                upper = 0
            if record[13]:
                lower = int(record[13], 16) - char
                assert -32768 <= lower <= 32767
                lower = lower & 0xffff
            else:
                lower = 0
            if record[14]:
                title = int(record[14], 16) - char
                assert -32768 <= title <= 32767
                title = title & 0xffff
            else:
                title = 0
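            # The three fields are stored as 16-bit two's-complement deltas
            # from the code point itself; a consumer would recover, say, the
            # uppercase mapping roughly as
            #
            #   delta = upper
            #   if delta >= 0x8000:
            #       delta -= 0x10000          # sign-extend the 16-bit value
            #   uppercase = char + delta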
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split the type index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we distinguish between
    # words inside a name and words ending a name; the latter include
    # the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0
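
    # Rationale (a sketch): the most frequent "short" words get one-byte
    # phrasebook codes; the remaining 256 - short byte values act as escape
    # prefixes, and an escape byte plus a following byte can address
    # escapes * 256 further words, which is why the loop above grows
    # escapes until escapes * 256 covers the whole word list.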

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)
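
    # Decoding sketch: word number i starts at lexicon_offset[i] and runs up
    # to (and including) the first byte with bit 7 set; stripping that bit
    # gives the final character.  "CAPITAL", for instance, is stored as the
    # bytes of "CAPITA" followed by chr(ord("L") + 128).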

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)
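
    # Decoding sketch: starting at phrasebook_offset[char], each byte b that
    # is < short is a word index by itself, while a byte b >= short is an
    # escape and combines with the following byte as ((b - short) << 8) + next.
    # The name is complete once a word whose lexicon encoding ends in
    # chr(128) (the appended null byte, see above) has been produced.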

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value must not be 0
                        assert value != "0" and value != "-1"
                        if not value: