Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

makeunicodedata.py@ 3391

Last change on this file since 3391 was 3225, checked in by bird, 19 years ago
Python 2.5
File size: 30.6 KB

Line
1	#
2	# (re)generate unicode property and type databases
3	#
4	# this script converts a unicode 3.2 database file to
5	# Modules/unicodedata_db.h, Modules/unicodename_db.h,
6	# and Objects/unicodetype_db.h
7	#
8	# history:
9	# 2000-09-24 fl created (based on bits and pieces from unidb)
10	# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
11	# 2000-09-25 fl added character type table
12	# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
13	# 2000-11-03 fl expand first/last ranges
14	# 2001-01-19 fl added character name tables (2.1)
15	# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
16	# 2002-09-11 wd use string methods
17	# 2002-10-18 mvl update to Unicode 3.2
18	# 2002-10-22 mvl generate NFC tables
19	# 2002-11-24 mvl expand all ranges, sort names version-independently
20	# 2002-11-25 mvl add UNIDATA_VERSION
21	# 2004-05-29 perky add east asian width information
22	# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
23	#
24	# written by Fredrik Lundh ([email protected])
25	#
26
27	import sys
28
29	SCRIPT = sys.argv[0]
30	VERSION = "2.5"
31
32	# The Unicode Database
33	UNIDATA_VERSION = "4.1.0"
34	UNICODE_DATA = "UnicodeData%s.txt"
35	COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
36	EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
37
38	old_versions = ["3.2.0"]
39
40	CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
41	"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
42	"Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
43	"So" ]
44
45	BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
46	"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
47	"ON" ]
48
49	EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
50
51	# note: should match definitions in Objects/unicodectype.c
52	ALPHA_MASK = 0x01
53	DECIMAL_MASK = 0x02
54	DIGIT_MASK = 0x04
55	LOWER_MASK = 0x08
56	LINEBREAK_MASK = 0x10
57	SPACE_MASK = 0x20
58	TITLE_MASK = 0x40
59	UPPER_MASK = 0x80
60
61	def maketables(trace=0):
62
63	print "--- Reading", UNICODE_DATA % "", "..."
64
65	version = ""
66	unicode = UnicodeData(UNICODE_DATA % version,
67	COMPOSITION_EXCLUSIONS % version,
68	EASTASIAN_WIDTH % version)
69
70	print len(filter(None, unicode.table)), "characters"
71
72	for version in old_versions:
73	print "--- Reading", UNICODE_DATA % ("-"+version), "..."
74	old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
75	COMPOSITION_EXCLUSIONS % ("-"+version),
76	EASTASIAN_WIDTH % ("-"+version))
77	print len(filter(None, old_unicode.table)), "characters"
78	merge_old_version(version, unicode, old_unicode)
79
80	makeunicodename(unicode, trace)
81	makeunicodedata(unicode, trace)
82	makeunicodetype(unicode, trace)
83
84	# --------------------------------------------------------------------
85	# unicode character properties
86
87	def makeunicodedata(unicode, trace):
88
89	dummy = (0, 0, 0, 0, 0)
90	table = [dummy]
91	cache = {0: dummy}
92	index = [0] * len(unicode.chars)
93
94	FILE = "Modules/unicodedata_db.h"
95
96	print "--- Preparing", FILE, "..."
97
98	# 1) database properties
99
100	for char in unicode.chars:
101	record = unicode.table[char]
102	if record:
103	# extract database properties
104	category = CATEGORY_NAMES.index(record[2])
105	combining = int(record[3])
106	bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
107	mirrored = record[9] == "Y"
108	eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
109	item = (
110	category, combining, bidirectional, mirrored, eastasianwidth
111	)
112	# add entry to index and item tables
113	i = cache.get(item)
114	if i is None:
115	cache[item] = i = len(table)
116	table.append(item)
117	index[char] = i
118
119	# 2) decomposition data
120
121	decomp_data = [0]
122	decomp_prefix = [""]
123	decomp_index = [0] * len(unicode.chars)
124	decomp_size = 0
125
126	comp_pairs = []
127	comp_first = [None] * len(unicode.chars)
128	comp_last = [None] * len(unicode.chars)
129
130	for char in unicode.chars:
131	record = unicode.table[char]
132	if record:
133	if record[5]:
134	decomp = record[5].split()
135	if len(decomp) > 19:
136	raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
137	# prefix
138	if decomp[0][0] == "<":
139	prefix = decomp.pop(0)
140	else:
141	prefix = ""
142	try:
143	i = decomp_prefix.index(prefix)
144	except ValueError:
145	i = len(decomp_prefix)
146	decomp_prefix.append(prefix)
147	prefix = i
148	assert prefix < 256
149	# content
150	decomp = [prefix + (len(decomp)<<8)] +\
151	map(lambda s: int(s, 16), decomp)
152	# Collect NFC pairs
153	if not prefix and len(decomp) == 3 and \
154	char not in unicode.exclusions and \
155	unicode.table[decomp[1]][3] == "0":
156	p, l, r = decomp
157	comp_first[l] = 1
158	comp_last[r] = 1
159	comp_pairs.append((l,r,char))
160	try:
161	i = decomp_data.index(decomp)
162	except ValueError:
163	i = len(decomp_data)
164	decomp_data.extend(decomp)
165	decomp_size = decomp_size + len(decomp) * 2
166	else:
167	i = 0
168	decomp_index[char] = i
169
170	f = l = 0
171	comp_first_ranges = []
172	comp_last_ranges = []
173	prev_f = prev_l = None
174	for i in unicode.chars:
175	if comp_first[i] is not None:
176	comp_first[i] = f
177	f += 1
178	if prev_f is None:
179	prev_f = (i,i)
180	elif prev_f[1]+1 == i:
181	prev_f = prev_f[0],i
182	else:
183	comp_first_ranges.append(prev_f)
184	prev_f = (i,i)
185	if comp_last[i] is not None:
186	comp_last[i] = l
187	l += 1
188	if prev_l is None:
189	prev_l = (i,i)
190	elif prev_l[1]+1 == i:
191	prev_l = prev_l[0],i
192	else:
193	comp_last_ranges.append(prev_l)
194	prev_l = (i,i)
195	comp_first_ranges.append(prev_f)
196	comp_last_ranges.append(prev_l)
197	total_first = f
198	total_last = l
199
200	comp_data = [0](total_firsttotal_last)
201	for f,l,char in comp_pairs:
202	f = comp_first[f]
203	l = comp_last[l]
204	comp_data[f*total_last+l] = char
205
206	print len(table), "unique properties"
207	print len(decomp_prefix), "unique decomposition prefixes"
208	print len(decomp_data), "unique decomposition entries:",
209	print decomp_size, "bytes"
210	print total_first, "first characters in NFC"
211	print total_last, "last characters in NFC"
212	print len(comp_pairs), "NFC pairs"
213
214	print "--- Writing", FILE, "..."
215
216	fp = open(FILE, "w")
217	print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
218	print >>fp
219	print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
220	print >>fp, "/* a list of unique database records */"
221	print >>fp, \
222	"const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
223	for item in table:
224	print >>fp, " {%d, %d, %d, %d, %d}," % item
225	print >>fp, "};"
226	print >>fp
227
228	print >>fp, "/* Reindexing of NFC first characters. */"
229	print >>fp, "#define TOTAL_FIRST",total_first
230	print >>fp, "#define TOTAL_LAST",total_last
231	print >>fp, "struct reindex{int start;short count,index;};"
232	print >>fp, "struct reindex nfc_first[] = {"
233	for start,end in comp_first_ranges:
234	print >>fp," { %d, %d, %d}," % (start,end-start,comp_first[start])
235	print >>fp," {0,0,0}"
236	print >>fp,"};\n"
237	print >>fp, "struct reindex nfc_last[] = {"
238	for start,end in comp_last_ranges:
239	print >>fp," { %d, %d, %d}," % (start,end-start,comp_last[start])
240	print >>fp," {0,0,0}"
241	print >>fp,"};\n"
242
243	# FIXME: <fl> the following tables could be made static, and
244	# the support code moved into unicodedatabase.c
245
246	print >>fp, "/* string literals */"
247	print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
248	for name in CATEGORY_NAMES:
249	print >>fp, " \"%s\"," % name
250	print >>fp, " NULL"
251	print >>fp, "};"
252
253	print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
254	for name in BIDIRECTIONAL_NAMES:
255	print >>fp, " \"%s\"," % name
256	print >>fp, " NULL"
257	print >>fp, "};"
258
259	print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
260	for name in EASTASIANWIDTH_NAMES:
261	print >>fp, " \"%s\"," % name
262	print >>fp, " NULL"
263	print >>fp, "};"
264
265	print >>fp, "static const char *decomp_prefix[] = {"
266	for name in decomp_prefix:
267	print >>fp, " \"%s\"," % name
268	print >>fp, " NULL"
269	print >>fp, "};"
270
271	# split record index table
272	index1, index2, shift = splitbins(index, trace)
273
274	print >>fp, "/* index tables for the database records */"
275	print >>fp, "#define SHIFT", shift
276	Array("index1", index1).dump(fp, trace)
277	Array("index2", index2).dump(fp, trace)
278
279	# split decomposition index table
280	index1, index2, shift = splitbins(decomp_index, trace)
281
282	print >>fp, "/* decomposition data */"
283	Array("decomp_data", decomp_data).dump(fp, trace)
284
285	print >>fp, "/* index tables for the decomposition data */"
286	print >>fp, "#define DECOMP_SHIFT", shift
287	Array("decomp_index1", index1).dump(fp, trace)
288	Array("decomp_index2", index2).dump(fp, trace)
289
290	index, index2, shift = splitbins(comp_data, trace)
291	print >>fp, "/* NFC pairs */"
292	print >>fp, "#define COMP_SHIFT", shift
293	Array("comp_index", index).dump(fp, trace)
294	Array("comp_data", index2).dump(fp, trace)
295
296	# Generate delta tables for old versions
297	for version, table, normalization in unicode.changed:
298	cversion = version.replace(".","_")
299	records = [table[0]]
300	cache = {table[0]:0}
301	index = [0] * len(table)
302	for i, record in enumerate(table):
303	try:
304	index[i] = cache[record]
305	except KeyError:
306	index[i] = cache[record] = len(records)
307	records.append(record)
308	index1, index2, shift = splitbins(index, trace)
309	print >>fp, "static const change_record change_records_%s[] = {" % cversion
310	for record in records:
311	print >>fp, "\t{ %s }," % ", ".join(map(str,record))
312	print >>fp, "};"
313	Array("changes_%s_index" % cversion, index1).dump(fp, trace)
314	Array("changes_%s_data" % cversion, index2).dump(fp, trace)
315	print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
316	print >>fp, "{"
317	print >>fp, "\tint index;"
318	print >>fp, "\tif (n >= 0x110000) index = 0;"
319	print >>fp, "\telse {"
320	print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
321	print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
322	(cversion, shift, ((1<<shift)-1))
323	print >>fp, "\t}"
324	print >>fp, "\treturn change_records_%s+index;" % cversion
325	print >>fp, "}\n"
326	print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
327	print >>fp, "{"
328	print >>fp, "\tswitch(n) {"
329	for k, v in normalization:
330	print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
331	print >>fp, "\tdefault: return 0;"
332	print >>fp, "\t}\n}\n"
333
334	fp.close()
335
336	# --------------------------------------------------------------------
337	# unicode character type tables
338
339	def makeunicodetype(unicode, trace):
340
341	FILE = "Objects/unicodetype_db.h"
342
343	print "--- Preparing", FILE, "..."
344
345	# extract unicode types
346	dummy = (0, 0, 0, 0, 0, 0)
347	table = [dummy]
348	cache = {0: dummy}
349	index = [0] * len(unicode.chars)
350
351	for char in unicode.chars:
352	record = unicode.table[char]
353	if record:
354	# extract database properties
355	category = record[2]
356	bidirectional = record[4]
357	flags = 0
358	if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
359	flags \|= ALPHA_MASK
360	if category == "Ll":
361	flags \|= LOWER_MASK
362	if category == "Zl" or bidirectional == "B":
363	flags \|= LINEBREAK_MASK
364	if category == "Zs" or bidirectional in ("WS", "B", "S"):
365	flags \|= SPACE_MASK
366	if category == "Lt":
367	flags \|= TITLE_MASK
368	if category == "Lu":
369	flags \|= UPPER_MASK
370	# use delta predictor for upper/lower/title
371	if record[12]:
372	upper = int(record[12], 16) - char
373	assert -32768 <= upper <= 32767
374	upper = upper & 0xffff
375	else:
376	upper = 0
377	if record[13]:
378	lower = int(record[13], 16) - char
379	assert -32768 <= lower <= 32767
380	lower = lower & 0xffff
381	else:
382	lower = 0
383	if record[14]:
384	title = int(record[14], 16) - char
385	assert -32768 <= lower <= 32767
386	title = title & 0xffff
387	else:
388	title = 0
389	# decimal digit, integer digit
390	decimal = 0
391	if record[6]:
392	flags \|= DECIMAL_MASK
393	decimal = int(record[6])
394	digit = 0
395	if record[7]:
396	flags \|= DIGIT_MASK
397	digit = int(record[7])
398	item = (
399	upper, lower, title, decimal, digit, flags
400	)
401	# add entry to index and item tables
402	i = cache.get(item)
403	if i is None:
404	cache[item] = i = len(table)
405	table.append(item)
406	index[char] = i
407
408	print len(table), "unique character type entries"
409
410	print "--- Writing", FILE, "..."
411
412	fp = open(FILE, "w")
413	print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
414	print >>fp
415	print >>fp, "/* a list of unique character type descriptors */"
416	print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
417	for item in table:
418	print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
419	print >>fp, "};"
420	print >>fp
421
422	# split decomposition index table
423	index1, index2, shift = splitbins(index, trace)
424
425	print >>fp, "/* type indexes */"
426	print >>fp, "#define SHIFT", shift
427	Array("index1", index1).dump(fp, trace)
428	Array("index2", index2).dump(fp, trace)
429
430	fp.close()
431
432	# --------------------------------------------------------------------
433	# unicode name database
434
435	def makeunicodename(unicode, trace):
436
437	FILE = "Modules/unicodename_db.h"
438
439	print "--- Preparing", FILE, "..."
440
441	# collect names
442	names = [None] * len(unicode.chars)
443
444	for char in unicode.chars:
445	record = unicode.table[char]
446	if record:
447	name = record[1].strip()
448	if name and name[0] != "<":
449	names[char] = name + chr(0)
450
451	print len(filter(lambda n: n is not None, names)), "distinct names"
452
453	# collect unique words from names (note that we differ between
454	# words inside a sentence, and words ending a sentence. the
455	# latter includes the trailing null byte.
456
457	words = {}
458	n = b = 0
459	for char in unicode.chars:
460	name = names[char]
461	if name:
462	w = name.split()
463	b = b + len(name)
464	n = n + len(w)
465	for w in w:
466	l = words.get(w)
467	if l:
468	l.append(None)
469	else:
470	words[w] = [len(words)]
471
472	print n, "words in text;", b, "bytes"
473
474	wordlist = words.items()
475
476	# sort on falling frequency, then by name
477	def cmpwords((aword, alist),(bword, blist)):
478	r = -cmp(len(alist),len(blist))
479	if r:
480	return r
481	return cmp(aword, bword)
482	wordlist.sort(cmpwords)
483
484	# figure out how many phrasebook escapes we need
485	escapes = 0
486	while escapes * 256 < len(wordlist):
487	escapes = escapes + 1
488	print escapes, "escapes"
489
490	short = 256 - escapes
491
492	assert short > 0
493
494	print short, "short indexes in lexicon"
495
496	# statistics
497	n = 0
498	for i in range(short):
499	n = n + len(wordlist[i][1])
500	print n, "short indexes in phrasebook"
501
502	# pick the most commonly used words, and sort the rest on falling
503	# length (to maximize overlap)
504
505	wordlist, wordtail = wordlist[:short], wordlist[short:]
506	wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
507	wordlist.extend(wordtail)
508
509	# generate lexicon from words
510
511	lexicon_offset = [0]
512	lexicon = ""
513	words = {}
514
515	# build a lexicon string
516	offset = 0
517	for w, x in wordlist:
518	# encoding: bit 7 indicates last character in word (chr(128)
519	# indicates the last character in an entire string)
520	ww = w[:-1] + chr(ord(w[-1])+128)
521	# reuse string tails, when possible
522	o = lexicon.find(ww)
523	if o < 0:
524	o = offset
525	lexicon = lexicon + ww
526	offset = offset + len(w)
527	words[w] = len(lexicon_offset)
528	lexicon_offset.append(o)
529
530	lexicon = map(ord, lexicon)
531
532	# generate phrasebook from names and lexicon
533	phrasebook = [0]
534	phrasebook_offset = [0] * len(unicode.chars)
535	for char in unicode.chars:
536	name = names[char]
537	if name:
538	w = name.split()
539	phrasebook_offset[char] = len(phrasebook)
540	for w in w:
541	i = words[w]
542	if i < short:
543	phrasebook.append(i)
544	else:
545	# store as two bytes
546	phrasebook.append((i>>8) + short)
547	phrasebook.append(i&255)
548
549	assert getsize(phrasebook) == 1
550
551	#
552	# unicode name hash table
553
554	# extract names
555	data = []
556	for char in unicode.chars:
557	record = unicode.table[char]
558	if record:
559	name = record[1].strip()
560	if name and name[0] != "<":
561	data.append((name, char))
562
563	# the magic number 47 was chosen to minimize the number of
564	# collisions on the current data set. if you like, change it
565	# and see what happens...
566
567	codehash = Hash("code", data, 47)
568
569	print "--- Writing", FILE, "..."
570
571	fp = open(FILE, "w")
572	print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
573	print >>fp
574	print >>fp, "#define NAME_MAXLEN", 256
575	print >>fp
576	print >>fp, "/* lexicon */"
577	Array("lexicon", lexicon).dump(fp, trace)
578	Array("lexicon_offset", lexicon_offset).dump(fp, trace)
579
580	# split decomposition index table
581	offset1, offset2, shift = splitbins(phrasebook_offset, trace)
582
583	print >>fp, "/* code->name phrasebook */"
584	print >>fp, "#define phrasebook_shift", shift
585	print >>fp, "#define phrasebook_short", short
586
587	Array("phrasebook", phrasebook).dump(fp, trace)
588	Array("phrasebook_offset1", offset1).dump(fp, trace)
589	Array("phrasebook_offset2", offset2).dump(fp, trace)
590
591	print >>fp, "/* name->code dictionary */"
592	codehash.dump(fp, trace)
593
594	fp.close()
595
596
597	def merge_old_version(version, new, old):
598	# Changes to exclusion file not implemented yet
599	if old.exclusions != new.exclusions:
600	raise NotImplementedError, "exclusions differ"
601
602	# In these change records, 0xFF means "no change"
603	bidir_changes = [0xFF]*0x110000
604	category_changes = [0xFF]*0x110000
605	decimal_changes = [0xFF]*0x110000
606	# In numeric data, 0 means "no change",
607	# -1 means "did not have a numeric value
608	numeric_changes = [0] * 0x110000
609	# normalization_changes is a list of key-value pairs
610	normalization_changes = []
611	for i in range(0x110000):
612	if new.table[i] is None:
613	# Characters unassigned in the new version ought to
614	# be unassigned in the old one
615	assert old.table[i] is None
616	continue
617	# check characters unassigned in the old version
618	if old.table[i] is None:
619	# category 0 is "unassigned"
620	category_changes[i] = 0
621	continue
622	# check characters that differ
623	if old.table[i] != new.table[i]:
624	for k in range(len(old.table[i])):
625	if old.table[i][k] != new.table[i][k]:
626	value = old.table[i][k]
627	if k == 2:
628	#print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
629	category_changes[i] = CATEGORY_NAMES.index(value)
630	elif k == 4:
631	#print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
632	bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
633	elif k == 5:
634	#print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
635	# We assume that all normalization changes are in 1:1 mappings
636	assert " " not in value
637	normalization_changes.append((i, value))
638	elif k == 6:
639	#print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
640	# we only support changes where the old value is a single digit
641	assert value in "0123456789"
642	decimal_changes[i] = int(value)
643	elif k == 8:
644	# print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
645	# Since 0 encodes "no change", the old value is better not 0
646	assert value != "0" and value != "-1"
647	if not value:
648	numeric_changes[i] = -1
649	else:
650	assert re.match("^[0-9]+$", value)
651	numeric_changes[i] = int(value)
652	elif k == 11:
653	# change to ISO comment, ignore
654	pass
655	elif k == 12:
656	# change to simple uppercase mapping; ignore
657	pass
658	elif k == 13:
659	# change to simple lowercase mapping; ignore
660	pass
661	elif k == 14:
662	# change to simple titlecase mapping; ignore
663	pass
664	else:
665	class Difference(Exception):pass
666	raise Difference, (hex(i), k, old.table[i], new.table[i])
667	new.changed.append((version, zip(bidir_changes, category_changes,
668	decimal_changes, numeric_changes),
669	normalization_changes))
670
671
672	# --------------------------------------------------------------------
673	# the following support code is taken from the unidb utilities
674	# Copyright (c) 1999-2000 by Secret Labs AB
675
676	# load a unicode-data file from disk
677
678	import sys
679
680	class UnicodeData:
681
682	def __init__(self, filename, exclusions, eastasianwidth, expand=1):
683	self.changed = []
684	file = open(filename)
685	table = [None] * 0x110000
686	while 1:
687	s = file.readline()
688	if not s:
689	break
690	s = s.strip().split(";")
691	char = int(s[0], 16)
692	table[char] = s
693
694	# expand first-last ranges
695	if expand:
696	field = None
697	for i in range(0, 0x110000):
698	s = table[i]
699	if s:
700	if s[1][-6:] == "First>":
701	s[1] = ""
702	field = s
703	elif s[1][-5:] == "Last>":
704	s[1] = ""
705	field = None
706	elif field:
707	f2 = field[:]
708	f2[0] = "%X" % i
709	table[i] = f2
710
711	# public attributes
712	self.filename = filename
713	self.table = table
714	self.chars = range(0x110000) # unicode 3.2
715
716	file = open(exclusions)
717	self.exclusions = {}
718	for s in file:
719	s = s.strip()
720	if not s:
721	continue
722	if s[0] == '#':
723	continue
724	char = int(s.split()[0],16)
725	self.exclusions[char] = 1
726
727	widths = [None] * 0x110000
728	for s in open(eastasianwidth):
729	s = s.strip()
730	if not s:
731	continue
732	if s[0] == '#':
733	continue
734	s = s.split()[0].split(';')
735	if '..' in s[0]:
736	first, last = [int(c, 16) for c in s[0].split('..')]
737	chars = range(first, last+1)
738	else:
739	chars = [int(s[0], 16)]
740	for char in chars:
741	widths[char] = s[1]
742	for i in range(0, 0x110000):
743	if table[i] is not None:
744	table[i].append(widths[i])
745
746	def uselatin1(self):
747	# restrict character range to ISO Latin 1
748	self.chars = range(256)
749
750	# hash table tools
751
752	# this is a straight-forward reimplementation of Python's built-in
753	# dictionary type, using a static data structure, and a custom string
754	# hash algorithm.
755
756	def myhash(s, magic):
757	h = 0
758	for c in map(ord, s.upper()):
759	h = (h * magic) + c
760	ix = h & 0xff000000L
761	if ix:
762	h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
763	return h
764
765	SIZES = [
766	(4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
767	(1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
768	(65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
769	(2097152,5), (4194304,3), (8388608,33), (16777216,27)
770	]
771
772	class Hash:
773	def __init__(self, name, data, magic):
774	# turn a (key, value) list into a static hash table structure
775
776	# determine table size
777	for size, poly in SIZES:
778	if size > len(data):
779	poly = size + poly
780	break
781	else:
782	raise AssertionError, "ran out of polynominals"
783
784	print size, "slots in hash table"
785
786	table = [None] * size
787
788	mask = size-1
789
790	n = 0
791
792	hash = myhash
793
794	# initialize hash table
795	for key, value in data:
796	h = hash(key, magic)
797	i = (~h) & mask
798	v = table[i]
799	if v is None:
800	table[i] = value
801	continue
802	incr = (h ^ (h >> 3)) & mask;
803	if not incr:
804	incr = mask
805	while 1:
806	n = n + 1
807	i = (i + incr) & mask
808	v = table[i]
809	if v is None:
810	table[i] = value
811	break
812	incr = incr << 1
813	if incr > mask:
814	incr = incr ^ poly
815
816	print n, "collisions"
817	self.collisions = n
818
819	for i in range(len(table)):
820	if table[i] is None:
821	table[i] = 0
822
823	self.data = Array(name + "_hash", table)
824	self.magic = magic
825	self.name = name
826	self.size = size
827	self.poly = poly
828
829	def dump(self, file, trace):
830	# write data to file, as a C array
831	self.data.dump(file, trace)
832	file.write("#define %s_magic %d\n" % (self.name, self.magic))
833	file.write("#define %s_size %d\n" % (self.name, self.size))
834	file.write("#define %s_poly %d\n" % (self.name, self.poly))
835
836	# stuff to deal with arrays of unsigned integers
837
838	class Array:
839
840	def __init__(self, name, data):
841	self.name = name
842	self.data = data
843
844	def dump(self, file, trace=0):
845	# write data to file, as a C array
846	size = getsize(self.data)
847	if trace:
848	print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
849	file.write("static ")
850	if size == 1:
851	file.write("unsigned char")
852	elif size == 2:
853	file.write("unsigned short")
854	else:
855	file.write("unsigned int")
856	file.write(" " + self.name + "[] = {\n")
857	if self.data:
858	s = " "
859	for item in self.data:
860	i = str(item) + ", "
861	if len(s) + len(i) > 78:
862	file.write(s + "\n")
863	s = " " + i
864	else:
865	s = s + i
866	if s.strip():
867	file.write(s + "\n")
868	file.write("};\n\n")
869
870	def getsize(data):
871	# return smallest possible integer size for the given array
872	maxdata = max(data)
873	if maxdata < 256:
874	return 1
875	elif maxdata < 65536:
876	return 2
877	else:
878	return 4
879
880	def splitbins(t, trace=0):
881	"""t, trace=0 -> (t1, t2, shift). Split a table to save space.
882
883	t is a sequence of ints. This function can be useful to save space if
884	many of the ints are the same. t1 and t2 are lists of ints, and shift
885	is an int, chosen to minimize the combined size of t1 and t2 (in C
886	code), and where for each i in range(len(t)),
887	t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
888	where mask is a bitmask isolating the last "shift" bits.
889
890	If optional arg trace is non-zero (default zero), progress info
891	is printed to sys.stderr. The higher the value, the more info
892	you'll get.
893	"""
894
895	import sys
896	if trace:
897	def dump(t1, t2, shift, bytes):
898	print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
899	len(t1), len(t2), shift, bytes)
900	print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
901	"bytes"
902	n = len(t)-1 # last valid index
903	maxshift = 0 # the most we can shift n and still have something left
904	if n > 0:
905	while n >> 1:
906	n >>= 1
907	maxshift += 1
908	del n
909	bytes = sys.maxint # smallest total size so far
910	t = tuple(t) # so slices can be dict keys
911	for shift in range(maxshift + 1):
912	t1 = []
913	t2 = []
914	size = 2**shift
915	bincache = {}
916	for i in range(0, len(t), size):
917	bin = t[i:i+size]
918	index = bincache.get(bin)
919	if index is None:
920	index = len(t2)
921	bincache[bin] = index
922	t2.extend(bin)
923	t1.append(index >> shift)
924	# determine memory size
925	b = len(t1)getsize(t1) + len(t2)getsize(t2)
926	if trace > 1:
927	dump(t1, t2, shift, b)
928	if b < bytes:
929	best = t1, t2, shift
930	bytes = b
931	t1, t2, shift = best
932	if trace:
933	print >>sys.stderr, "Best:",
934	dump(t1, t2, shift, bytes)
935	if __debug__:
936	# exhaustively verify that the decomposition is correct
937	mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
938	for i in xrange(len(t)):
939	assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
940	return best
941
942	if __name__ == "__main__":
943	maketables(1)

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/essentials/dev-lang/python/Tools/unicode/makeunicodedata.py@ 3391

Download in other formats: