source: trunk/essentials/dev-lang/python/Tools/unicode/makeunicodedata.py@3310

Last change on this file since 3310 was 3225, checked in by bird, 19 years ago

Python 2.5

File size: 30.6 KB
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl created (based on bits and pieces from unidb)
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl added character type table
# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
# 2000-11-03 fl expand first/last ranges
# 2001-01-19 fl added character name tables (2.1)
# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
# 2002-09-11 wd use string methods
# 2002-10-18 mvl update to Unicode 3.2
# 2002-10-22 mvl generate NFC tables
# 2002-11-24 mvl expand all ranges, sort names version-independently
# 2002-11-25 mvl add UNIDATA_VERSION
# 2004-05-29 perky add east asian width information
# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
#
# written by Fredrik Lundh ([email protected])
#

import sys

SCRIPT = sys.argv[0]
VERSION = "2.5"

# The Unicode Database
UNIDATA_VERSION = "4.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"

old_versions = ["3.2.0"]

CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
    "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
    "So" ]

BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
    "ON" ]

EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]

# note: should match definitions in Objects/unicodectype.c
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
def maketables(trace=0):

    print "--- Reading", UNICODE_DATA % "", "..."

    version = ""
    unicode = UnicodeData(UNICODE_DATA % version,
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version)

    print len(filter(None, unicode.table)), "characters"

    for version in old_versions:
        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version))
        print len(filter(None, old_unicode.table)), "characters"
        merge_old_version(version, unicode, old_unicode)

    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)

# --------------------------------------------------------------------
# unicode character properties

def makeunicodedata(unicode, trace):

    dummy = (0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    FILE = "Modules/unicodedata_db.h"

    print "--- Preparing", FILE, "..."

    # 1) database properties

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = CATEGORY_NAMES.index(record[2])
            combining = int(record[3])
            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
            mirrored = record[9] == "Y"
            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    # 2) decomposition data

    decomp_data = [0]
    decomp_prefix = [""]
    decomp_index = [0] * len(unicode.chars)
    decomp_size = 0

    comp_pairs = []
    comp_first = [None] * len(unicode.chars)
    comp_last = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            if record[5]:
                decomp = record[5].split()
                if len(decomp) > 19:
                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
                else:
                    prefix = ""
                try:
                    i = decomp_prefix.index(prefix)
                except ValueError:
                    i = len(decomp_prefix)
                    decomp_prefix.append(prefix)
                prefix = i
                assert prefix < 256
                # content
                decomp = [prefix + (len(decomp)<<8)] +\
                    map(lambda s: int(s, 16), decomp)
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
                   unicode.table[decomp[1]][3] == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
                    comp_pairs.append((l,r,char))
                try:
                    i = decomp_data.index(decomp)
                except ValueError:
                    i = len(decomp_data)
                    decomp_data.extend(decomp)
                    decomp_size = decomp_size + len(decomp) * 2
            else:
                i = 0
            decomp_index[char] = i
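
    # Layout note (a sketch, inferred from the packing above): each entry in
    # decomp_data starts with a header word whose low byte indexes
    # decomp_prefix (0 for canonical decompositions) and whose high byte is
    # the number of code points that follow, so an entry at offset i would
    # unpack roughly as
    #
    #   header = decomp_data[i]
    #   prefix = decomp_prefix[header & 255]
    #   count  = header >> 8
    #   code_points = decomp_data[i+1 : i+1+count]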

    f = l = 0
    comp_first_ranges = []
    comp_last_ranges = []
    prev_f = prev_l = None
    for i in unicode.chars:
        if comp_first[i] is not None:
            comp_first[i] = f
            f += 1
            if prev_f is None:
                prev_f = (i,i)
            elif prev_f[1]+1 == i:
                prev_f = prev_f[0],i
            else:
                comp_first_ranges.append(prev_f)
                prev_f = (i,i)
        if comp_last[i] is not None:
            comp_last[i] = l
            l += 1
            if prev_l is None:
                prev_l = (i,i)
            elif prev_l[1]+1 == i:
                prev_l = prev_l[0],i
            else:
                comp_last_ranges.append(prev_l)
                prev_l = (i,i)
    comp_first_ranges.append(prev_f)
    comp_last_ranges.append(prev_l)
    total_first = f
    total_last = l

    comp_data = [0]*(total_first*total_last)
    for f,l,char in comp_pairs:
        f = comp_first[f]
        l = comp_last[l]
        comp_data[f*total_last+l] = char
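
    # A sketch of the intended lookup: for a candidate pair of characters
    # (F, L) that both occur in comp_pairs, the composed character would be
    #
    #   comp_data[comp_first[F]*total_last + comp_last[L]]
    #
    # with 0 meaning "these two do not compose".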

    print len(table), "unique properties"
    print len(decomp_prefix), "unique decomposition prefixes"
    print len(decomp_data), "unique decomposition entries:",
    print decomp_size, "bytes"
    print total_first, "first characters in NFC"
    print total_last, "last characters in NFC"
    print len(comp_pairs), "NFC pairs"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
    print >>fp, "/* a list of unique database records */"
    print >>fp, \
          "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
    for item in table:
        print >>fp, " {%d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    print >>fp, "/* Reindexing of NFC first characters. */"
    print >>fp, "#define TOTAL_FIRST",total_first
    print >>fp, "#define TOTAL_LAST",total_last
    print >>fp, "struct reindex{int start;short count,index;};"
    print >>fp, "struct reindex nfc_first[] = {"
    for start,end in comp_first_ranges:
        print >>fp," { %d, %d, %d}," % (start,end-start,comp_first[start])
    print >>fp," {0,0,0}"
    print >>fp,"};\n"
    print >>fp, "struct reindex nfc_last[] = {"
    for start,end in comp_last_ranges:
        print >>fp," { %d, %d, %d}," % (start,end-start,comp_last[start])
    print >>fp," {0,0,0}"
    print >>fp,"};\n"

    # FIXME: <fl> the following tables could be made static, and
    # the support code moved into unicodedatabase.c

    print >>fp, "/* string literals */"
    print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
    for name in CATEGORY_NAMES:
        print >>fp, " \"%s\"," % name
    print >>fp, " NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
    for name in BIDIRECTIONAL_NAMES:
        print >>fp, " \"%s\"," % name
    print >>fp, " NULL"
    print >>fp, "};"

    print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
    for name in EASTASIANWIDTH_NAMES:
        print >>fp, " \"%s\"," % name
    print >>fp, " NULL"
    print >>fp, "};"

    print >>fp, "static const char *decomp_prefix[] = {"
    for name in decomp_prefix:
        print >>fp, " \"%s\"," % name
    print >>fp, " NULL"
    print >>fp, "};"

    # split record index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* index tables for the database records */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)
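
    # A sketch of how a consumer is meant to use this two-level table (the
    # real lookup lives in Modules/unicodedata.c and its exact spelling may
    # differ): with MASK = (1 << SHIFT) - 1,
    #
    #   i = index2[(index1[ch >> SHIFT] << SHIFT) + (ch & MASK)]
    #   record = _PyUnicode_Database_Records[i]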

    # split decomposition index table
    index1, index2, shift = splitbins(decomp_index, trace)

    print >>fp, "/* decomposition data */"
    Array("decomp_data", decomp_data).dump(fp, trace)

    print >>fp, "/* index tables for the decomposition data */"
    print >>fp, "#define DECOMP_SHIFT", shift
    Array("decomp_index1", index1).dump(fp, trace)
    Array("decomp_index2", index2).dump(fp, trace)

    index, index2, shift = splitbins(comp_data, trace)
    print >>fp, "/* NFC pairs */"
    print >>fp, "#define COMP_SHIFT", shift
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

    # Generate delta tables for old versions
    for version, table, normalization in unicode.changed:
        cversion = version.replace(".","_")
        records = [table[0]]
        cache = {table[0]:0}
        index = [0] * len(table)
        for i, record in enumerate(table):
            try:
                index[i] = cache[record]
            except KeyError:
                index[i] = cache[record] = len(records)
                records.append(record)
        index1, index2, shift = splitbins(index, trace)
        print >>fp, "static const change_record change_records_%s[] = {" % cversion
        for record in records:
            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
        print >>fp, "};"
        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tint index;"
        print >>fp, "\tif (n >= 0x110000) index = 0;"
        print >>fp, "\telse {"
        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
              (cversion, shift, ((1<<shift)-1))
        print >>fp, "\t}"
        print >>fp, "\treturn change_records_%s+index;" % cversion
        print >>fp, "}\n"
        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
        print >>fp, "{"
        print >>fp, "\tswitch(n) {"
        for k, v in normalization:
            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
        print >>fp, "\tdefault: return 0;"
        print >>fp, "\t}\n}\n"

    fp.close()

# --------------------------------------------------------------------
# unicode character type tables

def makeunicodetype(unicode, trace):

    FILE = "Objects/unicodetype_db.h"

    print "--- Preparing", FILE, "..."

    # extract unicode types
    dummy = (0, 0, 0, 0, 0, 0)
    table = [dummy]
    cache = {0: dummy}
    index = [0] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            # extract database properties
            category = record[2]
            bidirectional = record[4]
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
                flags |= LOWER_MASK
            if category == "Zl" or bidirectional == "B":
                flags |= LINEBREAK_MASK
            if category == "Zs" or bidirectional in ("WS", "B", "S"):
                flags |= SPACE_MASK
            if category == "Lt":
                flags |= TITLE_MASK
            if category == "Lu":
                flags |= UPPER_MASK
            # use delta predictor for upper/lower/title
            if record[12]:
                upper = int(record[12], 16) - char
                assert -32768 <= upper <= 32767
                upper = upper & 0xffff
            else:
                upper = 0
            if record[13]:
                lower = int(record[13], 16) - char
                assert -32768 <= lower <= 32767
                lower = lower & 0xffff
            else:
                lower = 0
            if record[14]:
                title = int(record[14], 16) - char
                assert -32768 <= title <= 32767
                title = title & 0xffff
            else:
                title = 0
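            # The three fields are stored as 16-bit two's-complement deltas
            # from the code point itself; a consumer would recover, say, the
            # uppercase mapping roughly as
            #
            #   delta = upper
            #   if delta >= 0x8000:
            #       delta -= 0x10000          # sign-extend the 16-bit value
            #   uppercase = char + delta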
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
                flags |= DECIMAL_MASK
                decimal = int(record[6])
            digit = 0
            if record[7]:
                flags |= DIGIT_MASK
                digit = int(record[7])
            item = (
                upper, lower, title, decimal, digit, flags
                )
            # add entry to index and item tables
            i = cache.get(item)
            if i is None:
                cache[item] = i = len(table)
                table.append(item)
            index[char] = i

    print len(table), "unique character type entries"

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "/* a list of unique character type descriptors */"
    print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
    for item in table:
        print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
    print >>fp, "};"
    print >>fp

    # split the type index table
    index1, index2, shift = splitbins(index, trace)

    print >>fp, "/* type indexes */"
    print >>fp, "#define SHIFT", shift
    Array("index1", index1).dump(fp, trace)
    Array("index2", index2).dump(fp, trace)

    fp.close()

# --------------------------------------------------------------------
# unicode name database

def makeunicodename(unicode, trace):

    FILE = "Modules/unicodename_db.h"

    print "--- Preparing", FILE, "..."

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print len(filter(lambda n: n is not None, names)), "distinct names"

    # collect unique words from names (note that we distinguish between
    # words inside a name and words ending a name; the latter include
    # the trailing null byte)

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print n, "words in text;", b, "bytes"

    wordlist = words.items()

    # sort on falling frequency, then by name
    def cmpwords((aword, alist),(bword, blist)):
        r = -cmp(len(alist),len(blist))
        if r:
            return r
        return cmp(aword, bword)
    wordlist.sort(cmpwords)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print escapes, "escapes"

    short = 256 - escapes

    assert short > 0
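
    # Rationale (a sketch): the most frequent "short" words get one-byte
    # phrasebook codes; the remaining 256 - short byte values act as escape
    # prefixes, and an escape byte plus a following byte can address
    # escapes * 256 further words, which is why the loop above grows
    # escapes until escapes * 256 covers the whole word list.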

    print short, "short indexes in lexicon"

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print n, "short indexes in phrasebook"

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = map(ord, lexicon)
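
    # Decoding sketch: word number i starts at lexicon_offset[i] and runs up
    # to (and including) the first byte with bit 7 set; stripping that bit
    # gives the final character.  "CAPITAL", for instance, is stored as the
    # bytes of "CAPITA" followed by chr(ord("L") + 128).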

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)
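
    # Decoding sketch: starting at phrasebook_offset[char], each byte b that
    # is < short is a word index by itself, while a byte b >= short is an
    # escape and combines with the following byte as ((b - short) << 8) + next.
    # The name is complete once a word whose lexicon encoding ends in
    # chr(128) (the appended null byte, see above) has been produced.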

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names
    data = []
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record[1].strip()
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print "--- Writing", FILE, "..."

    fp = open(FILE, "w")
    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
    print >>fp
    print >>fp, "#define NAME_MAXLEN", 256
    print >>fp
    print >>fp, "/* lexicon */"
    Array("lexicon", lexicon).dump(fp, trace)
    Array("lexicon_offset", lexicon_offset).dump(fp, trace)

    # split the phrasebook offset table
    offset1, offset2, shift = splitbins(phrasebook_offset, trace)

    print >>fp, "/* code->name phrasebook */"
    print >>fp, "#define phrasebook_shift", shift
    print >>fp, "#define phrasebook_short", short

    Array("phrasebook", phrasebook).dump(fp, trace)
    Array("phrasebook_offset1", offset1).dump(fp, trace)
    Array("phrasebook_offset2", offset2).dump(fp, trace)

    print >>fp, "/* name->code dictionary */"
    codehash.dump(fp, trace)

    fp.close()


def merge_old_version(version, new, old):
    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError, "exclusions differ"

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    for i in range(0x110000):
        if new.table[i] is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            assert old.table[i] is None
            continue
        # check characters unassigned in the old version
        if old.table[i] is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
            for k in range(len(old.table[i])):
                if old.table[i][k] != new.table[i][k]:
                    value = old.table[i][k]
                    if k == 2:
                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value must not be 0
                        assert value != "0" and value != "-1"
                        if not value: