| 1 | import re, unicodedata, sys
|
|---|
| 2 |
|
|---|
# This script scans every code point up to 0x10FFFF, so it cannot run on a
# narrow build where sys.maxunicode is 0xFFFF (65535).
# The exception is raised with call syntax, which is valid on both
# Python 2 and Python 3 (the "raise Class, value" form is Python 2 only).
if sys.maxunicode == 65535:
    raise RuntimeError("need UCS-4 Python")
|
|---|
| 5 |
|
|---|
def gen_category(cats):
    """Yield each code point whose Unicode general category is in *cats*.

    cats -- container of general-category abbreviations as returned by
            unicodedata.category(), e.g. ["Zs"] or ["Cc", "Cf"].

    Code points 0 through 0x10FFFF are examined in ascending order and the
    matching ones are yielded as integers.
    """
    # unichr() only exists on Python 2; on Python 3 chr() does the same job.
    try:
        to_char = unichr
    except NameError:
        to_char = chr
    for code in range(0, 0x110000):
        if unicodedata.category(to_char(code)) in cats:
            yield code
|
|---|
| 10 |
|
|---|
def gen_bidirectional(cats):
    """Yield each code point whose bidirectional class is in *cats*.

    cats -- container of bidirectional class names as returned by
            unicodedata.bidirectional(), e.g. ["R", "AL"] or ["L"].

    Code points 0 through 0x10FFFF are examined in ascending order and the
    matching ones are yielded as integers.
    """
    # unichr() only exists on Python 2; on Python 3 chr() does the same job.
    try:
        to_char = unichr
    except NameError:
        to_char = chr
    for code in range(0, 0x110000):
        if unicodedata.bidirectional(to_char(code)) in cats:
            yield code
|
|---|
| 15 |
|
|---|
def compact_set(l):
    """Return Python source text for a set holding the integers in *l*.

    l -- iterable of integers.  Values that follow their predecessor by
         exactly 1 are grouped into runs, so the output is most compact
         when *l* is in ascending order.

    A run that is interrupted by a later value is written as a range only
    when it holds more than three values; shorter runs are listed
    literally.  The final run is written as a range as soon as it holds two
    values.  The returned string has one of these shapes:

        set()                                   (empty input)
        set([a, b])                             (literals only)
        set(range(a,b))                         (one range only)
        set(list(range(a,b)) + list(range(c,d)))
        set([a, b] + list(range(c,d)))

    Ranges are wrapped in list() whenever they are concatenated, so the
    expression also evaluates where range() does not return a list.
    """
    singles = []
    ranges = []        # (start, stop) pairs; stop is exclusive
    start = None       # first value of the run being built
    span = 0           # number of values in the run after `start`
    for e in l:
        if start is None:
            start = e
            continue
        if start + span + 1 == e:
            # e extends the current run.
            span += 1
            continue
        # e breaks the run: flush it and begin a new one at e.
        if span > 2:
            ranges.append((start, start + span + 1))
        else:
            singles.extend(range(start, start + span + 1))
        start = e
        span = 0

    if start is None:
        # Nothing was iterated; previously this produced "set([None])".
        return "set()"

    # Flush the final run.
    if span:
        ranges.append((start, start + span + 1))
    else:
        singles.append(start)

    if not singles and len(ranges) == 1:
        # A lone range can be handed to set() directly.
        range_expr = "range(%d,%d)" % ranges[0]
    else:
        range_expr = " + ".join(["list(range(%d,%d))" % r for r in ranges])

    if not singles:
        return "set(%s)" % range_expr
    if not range_expr:
        return "set(%s)" % repr(singles)
    return "set(%s + %s)" % (repr(singles), range_expr)
|
|---|
| 46 |
|
|---|
| 47 | ############## Read the tables in the RFC #######################
|
|---|
| 48 |
|
|---|
# Read the whole RFC text into memory.  The file is closed explicitly
# instead of being left to the garbage collector; try/finally is used so
# this also runs on interpreters that predate the "with" statement.
rfc_file = open("rfc3454.txt")
try:
    data = rfc_file.readlines()
finally:
    rfc_file.close()

# (name, table) pairs, appended in the order the Start markers are found.
tables = []
# Name of the table whose Start marker was seen last; None before any.
curname = None
|
|---|
| 53 | for l in data:
|
|---|
| 54 | l = l.strip()
|
|---|
| 55 | if not l:
|
|---|
| 56 | continue
|
|---|
| 57 | # Skip RFC page breaks
|
|---|
| 58 | if l.startswith("Hoffman & Blanchet") or\
|
|---|
| 59 | l.startswith("RFC 3454"):
|
|---|
| 60 | continue
|
|---|
| 61 | # Find start/end lines
|
|---|
| 62 | m = re.match("----- (Start|End) Table ([A-Z](.[0-9])+) -----", l)
|
|---|
| 63 | if m:
|
|---|
| 64 | if m.group(1) == "Start":
|
|---|
| 65 | if curname:
|
|---|
| 66 | raise "Double Start",(curname, l)
|
|---|
| 67 | curname = m.group(2)
|
|---|
| 68 | table = {}
|
|---|
| 69 | tables.append((curname, table))
|
|---|
| 70 | continue
|
|---|
|
|---|