source: trunk/essentials/dev-lang/python/Tools/unicode/mkstringprep.py@ 3397

Last change on this file since 3397 was 3225, checked in by bird, 19 years ago

Python 2.5

File size: 9.8 KB
Line 
import re
import sys
import unicodedata

# The generators in this script walk every code point up to 0x10FFFF, so a
# narrow (UCS-2) interpreter build, whose sys.maxunicode is 65535, cannot
# run it.  The call form of raise is accepted by both Python 2 and Python 3.
if sys.maxunicode == 65535:
    raise RuntimeError("need UCS-4 Python")
5
def gen_category(cats):
    """Yield every code point whose Unicode general category is in *cats*.

    Code points 0 through 0x10FFFF are examined in ascending order, and a
    code point is yielded (as an int) when ``unicodedata.category`` of the
    corresponding character is found in *cats* using the ``in`` operator.
    """
    # Python 2 spells the code-point-to-character conversion unichr();
    # on Python 3 that name is gone and chr() covers the full range.
    try:
        to_char = unichr
    except NameError:
        to_char = chr
    for i in range(0, 0x110000):
        if unicodedata.category(to_char(i)) in cats:
            yield i
10
def gen_bidirectional(cats):
    """Yield every code point whose bidirectional class is in *cats*.

    Code points 0 through 0x10FFFF are examined in ascending order, and a
    code point is yielded (as an int) when ``unicodedata.bidirectional`` of
    the corresponding character is found in *cats* using the ``in``
    operator.
    """
    # Python 2 spells the code-point-to-character conversion unichr();
    # on Python 3 that name is gone and chr() covers the full range.
    try:
        to_char = unichr
    except NameError:
        to_char = chr
    for i in range(0, 0x110000):
        if unicodedata.bidirectional(to_char(i)) in cats:
            yield i
15
def compact_set(l):
    """Return Python source text for a set holding the integers in *l*.

    *l* is iterated once.  Runs of consecutive values (each value exactly
    one greater than the previous) are only detected when the values arrive
    in ascending order.  A run that is followed by further values is written
    as ``range(start,stop)`` when it has more than three members, otherwise
    its members are listed individually.  The final run is written as a
    range whenever it has at least two members.

    The result has one of the forms ``set([a, b])``, ``set(range(x,y))`` or
    ``set([a, b] + range(x,y) + range(u,v))``.  The combined forms add the
    result of ``range()`` to a list, so the generated text is meant for an
    interpreter where ``range()`` returns a list (Python 2).

    An empty *l* yields ``"set()"``.
    """
    singles = []
    ranges = []
    start = None   # first value of the run currently being accumulated
    span = 0       # how many values beyond start the current run covers
    for e in l:
        if start is None:
            start = e
            span = 0
        elif start + span + 1 == e:
            # e extends the current run by one.
            span += 1
        else:
            # e breaks the run: flush it, then begin a new run at e.
            if span > 2:
                ranges.append((start, start + span + 1))
            else:
                singles.extend(range(start, start + span + 1))
            start = e
            span = 0
    # Flush the trailing run.  Skipped entirely for empty input, which
    # would otherwise put None into the generated set.
    if start is not None:
        if span:
            ranges.append((start, start + span + 1))
        else:
            singles.append(start)
    range_expr = " + ".join(["range(%d,%d)" % r for r in ranges])
    if not singles:
        return "set(%s)" % range_expr
    if not range_expr:
        return "set(%s)" % repr(singles)
    return "set(%s + %s)" % (repr(singles), range_expr)
46
############## Read the tables in the RFC #######################

# Load the whole RFC text as a list of lines.  The handle is closed
# explicitly rather than left to garbage collection; try/finally is used
# because it needs no __future__ import on older interpreters.
rfc_file = open("rfc3454.txt")
try:
    data = rfc_file.readlines()
finally:
    rfc_file.close()

# (name, table) pairs, appended as each "Start Table" marker is found.
tables = []
# Name of the table whose Start marker was seen most recently; None until
# the first Start marker.
curname = None
53for l in data:
54 l = l.strip()
55 if not l:
56 continue
57 # Skip RFC page breaks
58 if l.startswith("Hoffman & Blanchet") or\
59 l.startswith("RFC 3454"):
60 continue
61 # Find start/end lines
62 m = re.match("----- (Start|End) Table ([A-Z](.[0-9])+) -----", l)
63 if m:
64 if m.group(1) == "Start":
65 if curname:
66 raise "Double Start",(curname, l)
67 curname = m.group(2)
68 table = {}
69 tables.append((curname, table))
70 continue