Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

unicodedata.c@ 3506

Last change on this file since 3506 was 3225, checked in by bird, 19 years ago
Python 2.5
File size: 34.7 KB

Line
1	/* ------------------------------------------------------------------------
2
3	unicodedata -- Provides access to the Unicode 4.1 data base.
4
5	Data was extracted from the Unicode 4.1 UnicodeData.txt file.
6
7	Written by Marc-Andre Lemburg ([email protected]).
8	Modified for Python 2.0 by Fredrik Lundh ([email protected])
9	Modified by Martin v. Löwis ([email protected])
10
11	Copyright (c) Corporation for National Research Initiatives.
12
13	------------------------------------------------------------------------ */
14
15	#include "Python.h"
16	#include "ucnhash.h"
17	#include "structmember.h"
18
19	/* character properties */
20
/* Property record for one class of characters.  Instances live in
   _PyUnicode_Database_Records and are located via _getrecord_ex(). */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* canonical combining class, 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* non-zero if the character is mirrored in bidirectional text */
    const unsigned char mirrored;
    /* index into _PyUnicode_EastAsianWidthNames */
    const unsigned char east_asian_width;
} _PyUnicode_DatabaseRecord;
31
/* Difference between the current database and a previous Unicode version
   for one character.  The field order should be the same as in
   merge_old_version.  The lookups in this file treat 0xFF in a byte field
   as "no change" and category_changed == 0 as "unassigned in the old
   version". */
typedef struct change_record {
    const unsigned char bidir_changed;      /* old bidirectional index */
    const unsigned char category_changed;   /* old category index */
    const unsigned char decimal_changed;    /* old decimal value */
    const int numeric_changed;
} change_record;
39
40	/* data file generated by Tools/unicode/makeunicodedata.py */
41	#include "unicodedata_db.h"
42
43	static const _PyUnicode_DatabaseRecord*
44	_getrecord_ex(Py_UCS4 code)
45	{
46	int index;
47	if (code >= 0x110000)
48	index = 0;
49	else {
50	index = index1[(code>>SHIFT)];
51	index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
52	}
53
54	return &_PyUnicode_Database_Records[index];
55	}
56
57	static const _PyUnicode_DatabaseRecord*
58	_getrecord(PyUnicodeObject* v)
59	{
60	return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
61	}
62
63	/* ------------- Previous-version API ------------------------------------- */
64	typedef struct previous_version {
65	PyObject_HEAD
66	const char *name;
67	const change_record* (*getrecord)(Py_UCS4);
68	Py_UCS4 (*normalization)(Py_UCS4);
69	} PreviousDBVersion;
70
71	#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
72
73	static PyMemberDef DB_members[] = {
74	{"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
75	{NULL}
76	};
77
78	// forward declaration
79	static PyTypeObject UCD_Type;
80
81	static PyObject*
82	new_previous_version(const charname, const change_record (*getrecord)(Py_UCS4),
83	Py_UCS4 (*normalization)(Py_UCS4))
84	{
85	PreviousDBVersion *self;
86	self = PyObject_New(PreviousDBVersion, &UCD_Type);
87	if (self == NULL)
88	return NULL;
89	self->name = name;
90	self->getrecord = getrecord;
91	self->normalization = normalization;
92	return (PyObject*)self;
93	}
94
95	/* --- Module API --------------------------------------------------------- */
96
97	PyDoc_STRVAR(unicodedata_decimal__doc__,
98	"decimal(unichr[, default])\n\
99	\n\
100	Returns the decimal value assigned to the Unicode character unichr\n\
101	as integer. If no such value is defined, default is returned, or, if\n\
102	not given, ValueError is raised.");
103
104	static PyObject *
105	unicodedata_decimal(PyObject self, PyObject args)
106	{
107	PyUnicodeObject *v;
108	PyObject *defobj = NULL;
109	int have_old = 0;
110	long rc;
111
112	if (!PyArg_ParseTuple(args, "O!\|O:decimal", &PyUnicode_Type, &v, &defobj))
113	return NULL;
114	if (PyUnicode_GET_SIZE(v) != 1) {
115	PyErr_SetString(PyExc_TypeError,
116	"need a single Unicode character as parameter");
117	return NULL;
118	}
119
120	if (self) {
121	const change_record old = get_old_record(self, PyUnicode_AS_UNICODE(v));
122	if (old->category_changed == 0) {
123	/* unassigned */
124	have_old = 1;
125	rc = -1;
126	}
127	else if (old->decimal_changed != 0xFF) {
128	have_old = 1;
129	rc = old->decimal_changed;
130	}
131	}
132
133	if (!have_old)
134	rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
135	if (rc < 0) {
136	if (defobj == NULL) {
137	PyErr_SetString(PyExc_ValueError,
138	"not a decimal");
139	return NULL;
140	}
141	else {
142	Py_INCREF(defobj);
143	return defobj;
144	}
145	}
146	return PyInt_FromLong(rc);
147	}
148
149	PyDoc_STRVAR(unicodedata_digit__doc__,
150	"digit(unichr[, default])\n\
151	\n\
152	Returns the digit value assigned to the Unicode character unichr as\n\
153	integer. If no such value is defined, default is returned, or, if\n\
154	not given, ValueError is raised.");
155
156	static PyObject *
157	unicodedata_digit(PyObject self, PyObject args)
158	{
159	PyUnicodeObject *v;
160	PyObject *defobj = NULL;
161	long rc;
162
163	if (!PyArg_ParseTuple(args, "O!\|O:digit", &PyUnicode_Type, &v, &defobj))
164	return NULL;
165	if (PyUnicode_GET_SIZE(v) != 1) {
166	PyErr_SetString(PyExc_TypeError,
167	"need a single Unicode character as parameter");
168	return NULL;
169	}
170	rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
171	if (rc < 0) {
172	if (defobj == NULL) {
173	PyErr_SetString(PyExc_ValueError, "not a digit");
174	return NULL;
175	}
176	else {
177	Py_INCREF(defobj);
178	return defobj;
179	}
180	}
181	return PyInt_FromLong(rc);
182	}
183
184	PyDoc_STRVAR(unicodedata_numeric__doc__,
185	"numeric(unichr[, default])\n\
186	\n\
187	Returns the numeric value assigned to the Unicode character unichr\n\
188	as float. If no such value is defined, default is returned, or, if\n\
189	not given, ValueError is raised.");
190
191	static PyObject *
192	unicodedata_numeric(PyObject self, PyObject args)
193	{
194	PyUnicodeObject *v;
195	PyObject *defobj = NULL;
196	int have_old = 0;
197	double rc;
198
199	if (!PyArg_ParseTuple(args, "O!\|O:numeric", &PyUnicode_Type, &v, &defobj))
200	return NULL;
201	if (PyUnicode_GET_SIZE(v) != 1) {
202	PyErr_SetString(PyExc_TypeError,
203	"need a single Unicode character as parameter");
204	return NULL;
205	}
206
207	if (self) {
208	const change_record old = get_old_record(self, PyUnicode_AS_UNICODE(v));
209	if (old->category_changed == 0) {
210	/* unassigned */
211	have_old = 1;
212	rc = -1.0;
213	}
214	else if (old->decimal_changed != 0xFF) {
215	have_old = 1;
216	rc = old->decimal_changed;
217	}
218	}
219
220	if (!have_old)
221	rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
222	if (rc == -1.0) {
223	if (defobj == NULL) {
224	PyErr_SetString(PyExc_ValueError, "not a numeric character");
225	return NULL;
226	}
227	else {
228	Py_INCREF(defobj);
229	return defobj;
230	}
231	}
232	return PyFloat_FromDouble(rc);
233	}
234
235	PyDoc_STRVAR(unicodedata_category__doc__,
236	"category(unichr)\n\
237	\n\
238	Returns the general category assigned to the Unicode character\n\
239	unichr as string.");
240
241	static PyObject *
242	unicodedata_category(PyObject self, PyObject args)
243	{
244	PyUnicodeObject *v;
245	int index;
246
247	if (!PyArg_ParseTuple(args, "O!:category",
248	&PyUnicode_Type, &v))
249	return NULL;
250	if (PyUnicode_GET_SIZE(v) != 1) {
251	PyErr_SetString(PyExc_TypeError,
252	"need a single Unicode character as parameter");
253	return NULL;
254	}
255	index = (int) _getrecord(v)->category;
256	if (self) {
257	const change_record old = get_old_record(self, PyUnicode_AS_UNICODE(v));
258	if (old->category_changed != 0xFF)
259	index = old->category_changed;
260	}
261	return PyString_FromString(_PyUnicode_CategoryNames[index]);
262	}
263
264	PyDoc_STRVAR(unicodedata_bidirectional__doc__,
265	"bidirectional(unichr)\n\
266	\n\
267	Returns the bidirectional category assigned to the Unicode character\n\
268	unichr as string. If no such value is defined, an empty string is\n\
269	returned.");
270
271	static PyObject *
272	unicodedata_bidirectional(PyObject self, PyObject args)
273	{
274	PyUnicodeObject *v;
275	int index;
276
277	if (!PyArg_ParseTuple(args, "O!:bidirectional",
278	&PyUnicode_Type, &v))
279	return NULL;
280	if (PyUnicode_GET_SIZE(v) != 1) {
281	PyErr_SetString(PyExc_TypeError,
282	"need a single Unicode character as parameter");
283	return NULL;
284	}
285	index = (int) _getrecord(v)->bidirectional;
286	if (self) {
287	const change_record old = get_old_record(self, PyUnicode_AS_UNICODE(v));
288	if (old->category_changed == 0)
289	index = 0; /* unassigned */
290	else if (old->bidir_changed != 0xFF)
291	index = old->bidir_changed;
292	}
293	return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
294	}
295
296	PyDoc_STRVAR(unicodedata_combining__doc__,
297	"combining(unichr)\n\
298	\n\
299	Returns the canonical combining class assigned to the Unicode\n\
300	character unichr as integer. Returns 0 if no combining class is\n\
301	defined.");
302
303	static PyObject *
304	unicodedata_combining(PyObject self, PyObject args)
305	{
306	PyUnicodeObject *v;
307	int index;
308
309	if (!PyArg_ParseTuple(args, "O!:combining",
310	&PyUnicode_Type, &v))
311	return NULL;
312	if (PyUnicode_GET_SIZE(v) != 1) {
313	PyErr_SetString(PyExc_TypeError,
314	"need a single Unicode character as parameter");
315	return NULL;
316	}
317	index = (int) _getrecord(v)->combining;
318	if (self) {
319	const change_record old = get_old_record(self, PyUnicode_AS_UNICODE(v));
320	if (old->category_changed == 0)
321	index = 0; /* unassigned */
322	}
323	return PyInt_FromLong(index);
324	}
325
326	PyDoc_STRVAR(unicodedata_mirrored__doc__,
327	"mirrored(unichr)\n\
328	\n\
329	Returns the mirrored property assigned to the Unicode character\n\
330	unichr as integer. Returns 1 if the character has been identified as\n\
331	a \"mirrored\" character in bidirectional text, 0 otherwise.");
332
333	static PyObject *
334	unicodedata_mirrored(PyObject self, PyObject args)
335	{
336	PyUnicodeObject *v;
337	int index;
338
339	if (!PyArg_ParseTuple(args, "O!:mirrored",
340	&PyUnicode_Type, &v))
341	return NULL;
342	if (PyUnicode_GET_SIZE(v) != 1) {
343	PyErr_SetString(PyExc_TypeError,
344	"need a single Unicode character as parameter");
345	return NULL;
346	}
347	index = (int) _getrecord(v)->mirrored;
348	if (self) {
349	const change_record old = get_old_record(self, PyUnicode_AS_UNICODE(v));
350	if (old->category_changed == 0)
351	index = 0; /* unassigned */
352	}
353	return PyInt_FromLong(index);
354	}
355
356	PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
357	"east_asian_width(unichr)\n\
358	\n\
359	Returns the east asian width assigned to the Unicode character\n\
360	unichr as string.");
361
362	static PyObject *
363	unicodedata_east_asian_width(PyObject self, PyObject args)
364	{
365	PyUnicodeObject *v;
366	int index;
367
368	if (!PyArg_ParseTuple(args, "O!:east_asian_width",
369	&PyUnicode_Type, &v))
370	return NULL;
371	if (PyUnicode_GET_SIZE(v) != 1) {
372	PyErr_SetString(PyExc_TypeError,
373	"need a single Unicode character as parameter");
374	return NULL;
375	}
376	index = (int) _getrecord(v)->east_asian_width;
377	if (self) {
378	const change_record old = get_old_record(self, PyUnicode_AS_UNICODE(v));
379	if (old->category_changed == 0)
380	index = 0; /* unassigned */
381	}
382	return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
383	}
384
385	PyDoc_STRVAR(unicodedata_decomposition__doc__,
386	"decomposition(unichr)\n\
387	\n\
388	Returns the character decomposition mapping assigned to the Unicode\n\
389	character unichr as string. An empty string is returned in case no\n\
390	such mapping is defined.");
391
392	static PyObject *
393	unicodedata_decomposition(PyObject self, PyObject args)
394	{
395	PyUnicodeObject *v;
396	char decomp[256];
397	int code, index, count, i;
398	unsigned int prefix_index;
399
400	if (!PyArg_ParseTuple(args, "O!:decomposition",
401	&PyUnicode_Type, &v))
402	return NULL;
403	if (PyUnicode_GET_SIZE(v) != 1) {
404	PyErr_SetString(PyExc_TypeError,
405	"need a single Unicode character as parameter");
406	return NULL;
407	}
408
409	code = (int) *PyUnicode_AS_UNICODE(v);
410
411	if (self) {
412	const change_record old = get_old_record(self, PyUnicode_AS_UNICODE(v));
413	if (old->category_changed == 0)
414	return PyString_FromString(""); /* unassigned */
415	}
416
417	if (code < 0 \|\| code >= 0x110000)
418	index = 0;
419	else {
420	index = decomp_index1[(code>>DECOMP_SHIFT)];
421	index = decomp_index2[(index<<DECOMP_SHIFT)+
422	(code&((1<<DECOMP_SHIFT)-1))];
423	}
424
425	/* high byte is number of hex bytes (usually one or two), low byte
426	is prefix code (from*/
427	count = decomp_data[index] >> 8;
428
429	/* XXX: could allocate the PyString up front instead
430	(strlen(prefix) + 5 * count + 1 bytes) */
431
432	/* Based on how index is calculated above and decomp_data is generated
433	from Tools/unicode/makeunicodedata.py, it should not be possible
434	to overflow decomp_prefix. */
435	prefix_index = decomp_data[index] & 255;
436	assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
437
438	/* copy prefix */
439	i = strlen(decomp_prefix[prefix_index]);
440	memcpy(decomp, decomp_prefix[prefix_index], i);
441
442	while (count-- > 0) {
443	if (i)
444	decomp[i++] = ' ';
445	assert((size_t)i < sizeof(decomp));
446	PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
447	decomp_data[++index]);
448	i += strlen(decomp + i);
449	}
450
451	decomp[i] = '\0';
452
453	return PyString_FromString(decomp);
454	}
455
456	static void
457	get_decomp_record(PyObject self, Py_UCS4 code, int index, int prefix, int count)
458	{
459	if (code >= 0x110000) {
460	*index = 0;
461	} else if (self && get_old_record(self, code)->category_changed==0) {
462	/* unassigned in old version */
463	*index = 0;
464	}
465	else {
466	*index = decomp_index1[(code>>DECOMP_SHIFT)];
467	index = decomp_index2[(index<<DECOMP_SHIFT)+
468	(code&((1<<DECOMP_SHIFT)-1))];
469	}
470
471	/* high byte is number of hex bytes (usually one or two), low byte
472	is prefix code (from*/
473	count = decomp_data[index] >> 8;
474	prefix = decomp_data[index] & 255;
475
476	(*index)++;
477	}
478
/* Constants for algorithmic Hangul syllable (de)composition. */
#define SBase   0xAC00              /* first precomposed syllable */
#define LBase   0x1100              /* first leading consonant */
#define VBase   0x1161              /* first vowel */
#define TBase   0x11A7              /* one before the first trailing consonant */
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)     /* syllables per leading consonant */
#define SCount  (LCount*NCount)     /* total precomposed syllables */
488
489	static PyObject*
490	nfd_nfkd(PyObject self, PyObject input, int k)
491	{
492	PyObject *result;
493	Py_UNICODE i, end, *o;
494	/* Longest decomposition in Unicode 3.2: U+FDFA */
495	Py_UNICODE stack[20];
496	Py_ssize_t space, isize;
497	int index, prefix, count, stackptr;
498	unsigned char prev, cur;
499
500	stackptr = 0;
501	isize = PyUnicode_GET_SIZE(input);
502	/* Overallocate atmost 10 characters. */
503	space = (isize > 10 ? 10 : isize) + isize;
504	result = PyUnicode_FromUnicode(NULL, space);
505	if (!result)
506	return NULL;
507	i = PyUnicode_AS_UNICODE(input);
508	end = i + isize;
509	o = PyUnicode_AS_UNICODE(result);
510
511	while (i < end) {
512	stack[stackptr++] = *i++;
513	while(stackptr) {
514	Py_UNICODE code = stack[--stackptr];
515	/* Hangul Decomposition adds three characters in
516	a single step, so we need atleast that much room. */
517	if (space < 3) {
518	Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
519	space += 10;
520	if (PyUnicode_Resize(&result, newsize) == -1)
521	return NULL;
522	o = PyUnicode_AS_UNICODE(result) + newsize - space;
523	}
524	/* Hangul Decomposition. */
525	if (SBase <= code && code < (SBase+SCount)) {
526	int SIndex = code - SBase;
527	int L = LBase + SIndex / NCount;
528	int V = VBase + (SIndex % NCount) / TCount;
529	int T = TBase + SIndex % TCount;
530	*o++ = L;
531	*o++ = V;
532	space -= 2;
533	if (T != TBase) {
534	*o++ = T;
535	space --;
536	}
537	continue;
538	}
539	/* normalization changes */
540	if (self) {
541	Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
542	if (value != 0) {
543	stack[stackptr++] = value;
544	continue;
545	}
546	}
547
548	/* Other decompositions. */
549	get_decomp_record(self, code, &index, &prefix, &count);
550
551	/* Copy character if it is not decomposable, or has a
552	compatibility decomposition, but we do NFD. */
553	if (!count \|\| (prefix && !k)) {
554	*o++ = code;
555	space--;
556	continue;
557	}
558	/* Copy decomposition onto the stack, in reverse
559	order. */
560	while(count) {
561	code = decomp_data[index + (--count)];
562	stack[stackptr++] = code;
563	}
564	}
565	}
566
567	/* Drop overallocation. Cannot fail. */
568	PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
569
570	/* Sort canonically. */
571	i = PyUnicode_AS_UNICODE(result);
572	prev = _getrecord_ex(*i)->combining;
573	end = i + PyUnicode_GET_SIZE(result);
574	for (i++; i < end; i++) {
575	cur = _getrecord_ex(*i)->combining;
576	if (prev == 0 \|\| cur == 0 \|\| prev <= cur) {
577	prev = cur;
578	continue;
579	}
580	/* Non-canonical order. Need to switch i with previous. /
581	o = i - 1;
582	while (1) {
583	Py_UNICODE tmp = o[1];
584	o[1] = o[0];
585	o[0] = tmp;
586	o--;
587	if (o < PyUnicode_AS_UNICODE(result))
588	break;
589	prev = _getrecord_ex(*o)->combining;
590	if (prev == 0 \|\| prev <= cur)
591	break;
592	}
593	prev = _getrecord_ex(*i)->combining;
594	}
595	return result;
596	}
597
598	static int
599	find_nfc_index(PyObject self, struct reindex nfc, Py_UNICODE code)
600	{
601	int index;
602	for (index = 0; nfc[index].start; index++) {
603	int start = nfc[index].start;
604	if (code < start)
605	return -1;
606	if (code <= start + nfc[index].count) {
607	int delta = code - start;
608	return nfc[index].index + delta;
609	}
610	}
611	return -1;
612	}
613
614	static PyObject*
615	nfc_nfkc(PyObject self, PyObject input, int k)
616	{
617	PyObject *result;
618	Py_UNICODE i, i1, o, end;
619	int f,l,index,index1,comb;
620	Py_UNICODE code;
621	Py_UNICODE *skipped[20];
622	int cskipped = 0;
623
624	result = nfd_nfkd(self, input, k);
625	if (!result)
626	return NULL;
627
628	/* We are going to modify result in-place.
629	If nfd_nfkd is changed to sometimes return the input,
630	this code needs to be reviewed. */
631	assert(result != input);
632
633	i = PyUnicode_AS_UNICODE(result);
634	end = i + PyUnicode_GET_SIZE(result);
635	o = PyUnicode_AS_UNICODE(result);
636
637	again:
638	while (i < end) {
639	for (index = 0; index < cskipped; index++) {
640	if (skipped[index] == i) {
641	/* *i character is skipped.
642	Remove from list. */
643	skipped[index] = skipped[cskipped-1];
644	cskipped--;
645	i++;
646	goto again; /* continue while */
647	}
648	}
649	/* Hangul Composition. We don't need to check for <LV,T>
650	pairs, since we always have decomposed data. */
651	if (LBase <= i && i < (LBase+LCount) &&
652	i + 1 < end &&
653	VBase <= i[1] && i[1] <= (VBase+VCount)) {
654	int LIndex, VIndex;
655	LIndex = i[0] - LBase;
656	VIndex = i[1] - VBase;
657	code = SBase + (LIndexVCount+VIndex)TCount;
658	i+=2;
659	if (i < end &&
660	TBase <= i && i <= (TBase+TCount)) {
661	code += *i-TBase;
662	i++;
663	}
664	*o++ = code;
665	continue;
666	}
667
668	f = find_nfc_index(self, nfc_first, *i);
669	if (f == -1) {
670	o++ = i++;
671	continue;
672	}
673	/* Find next unblocked character. */
674	i1 = i+1;
675	comb = 0;
676	while (i1 < end) {
677	int comb1 = _getrecord_ex(*i1)->combining;
678	if (comb1 && comb == comb1) {
679	/* Character is blocked. */
680	i1++;
681	continue;
682	}
683	l = find_nfc_index(self, nfc_last, *i1);
684	/* i1 cannot be combined with i. If *i1
685	is a starter, we don't need to look further.
686	Otherwise, record the combining class. */
687	if (l == -1) {
688	not_combinable:
689	if (comb1 == 0)
690	break;
691	comb = comb1;
692	i1++;
693	continue;
694	}
695	index = f*TOTAL_LAST + l;
696	index1 = comp_index[index >> COMP_SHIFT];
697	code = comp_data[(index1<<COMP_SHIFT)+
698	(index&((1<<COMP_SHIFT)-1))];
699	if (code == 0)
700	goto not_combinable;
701
702	/* Replace the original character. */
703	*i = code;
704	/* Mark the second character unused. */
705	skipped[cskipped++] = i1;
706	i1++;
707	f = find_nfc_index(self, nfc_first, *i);
708	if (f == -1)
709	break;
710	}
711	o++ = i++;
712	}
713	if (o != end)
714	PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
715	return result;
716	}
717
718	PyDoc_STRVAR(unicodedata_normalize__doc__,
719	"normalize(form, unistr)\n\
720	\n\
721	Return the normal form 'form' for the Unicode string unistr. Valid\n\
722	values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
723
724	static PyObject*
725	unicodedata_normalize(PyObject self, PyObject args)
726	{
727	char *form;
728	PyObject *input;
729
730	if(!PyArg_ParseTuple(args, "sO!:normalize",
731	&form, &PyUnicode_Type, &input))
732	return NULL;
733
734	if (PyUnicode_GetSize(input) == 0) {
735	/* Special case empty input strings, since resizing
736	them later would cause internal errors. */
737	Py_INCREF(input);
738	return input;
739	}
740
741	if (strcmp(form, "NFC") == 0)
742	return nfc_nfkc(self, input, 0);
743	if (strcmp(form, "NFKC") == 0)
744	return nfc_nfkc(self, input, 1);
745	if (strcmp(form, "NFD") == 0)
746	return nfd_nfkd(self, input, 0);
747	if (strcmp(form, "NFKD") == 0)
748	return nfd_nfkd(self, input, 1);
749	PyErr_SetString(PyExc_ValueError, "invalid normalization form");
750	return NULL;
751	}
752
753	/* -------------------------------------------------------------------- */
754	/* unicode character name tables */
755
756	/* data file generated by Tools/unicode/makeunicodedata.py */
757	#include "unicodename_db.h"
758
759	/* -------------------------------------------------------------------- */
760	/* database code (cut and pasted from the unidb package) */
761
/* Case-insensitive hash of the first len bytes of s.
   Each byte is upper-cased and mixed in with multiplier `scale`; whenever
   bits 24-31 become set they are folded back into the low byte so the
   running value stays within 24 bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos;

    for (pos = 0; pos < len; pos++) {
        unsigned long top;

        hash = (hash * scale) + (unsigned char) toupper(Py_CHARMASK(s[pos]));
        top = hash & 0xff000000;
        if (!top)
            continue;
        hash = (hash ^ ((top >> 24) & 0xff)) & 0x00ffffff;
    }
    return hash;
}
776
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
   Column 0: leading consonants, column 1: vowels, column 2: trailing
   consonants.  A column that is shorter than the table is padded with
   null pointers. */
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { NULL, "YI",  "S"  },
    { NULL, "I",   "SS" },
    { NULL, NULL,  "NG" },
    { NULL, NULL,  "J"  },
    { NULL, NULL,  "C"  },
    { NULL, NULL,  "K"  },
    { NULL, NULL,  "T"  },
    { NULL, NULL,  "P"  },
    { NULL, NULL,  "H"  }
};
807
808	static int
809	is_unified_ideograph(Py_UCS4 code)
810	{
811	return (
812	(0x3400 <= code && code <= 0x4DB5) \|\| /* CJK Ideograph Extension A */
813	(0x4E00 <= code && code <= 0x9FBB) \|\| /* CJK Ideograph */
814	(0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
815	}
816
817	static int
818	_getucname(PyObject self, Py_UCS4 code, char buffer, int buflen)
819	{
820	int offset;
821	int i;
822	int word;
823	unsigned char* w;
824
825	if (code >= 0x110000)
826	return 0;
827
828	if (self) {
829	const change_record *old = get_old_record(self, code);
830	if (old->category_changed == 0) {
831	/* unassigned */
832	return 0;
833	}
834	}
835
836	if (SBase <= code && code < SBase+SCount) {
837	/* Hangul syllable. */
838	int SIndex = code - SBase;
839	int L = SIndex / NCount;
840	int V = (SIndex % NCount) / TCount;
841	int T = SIndex % TCount;
842
843	if (buflen < 27)
844	/* Worst case: HANGUL SYLLABLE <10chars>. */
845	return 0;
846	strcpy(buffer, "HANGUL SYLLABLE ");
847	buffer += 16;
848	strcpy(buffer, hangul_syllables[L][0]);
849	buffer += strlen(hangul_syllables[L][0]);
850	strcpy(buffer, hangul_syllables[V][1]);
851	buffer += strlen(hangul_syllables[V][1]);
852	strcpy(buffer, hangul_syllables[T][2]);
853	buffer += strlen(hangul_syllables[T][2]);
854	*buffer = '\0';
855	return 1;
856	}
857
858	if (is_unified_ideograph(code)) {
859	if (buflen < 28)
860	/* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
861	return 0;
862	sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
863	return 1;
864	}
865
866	/* get offset into phrasebook */
867	offset = phrasebook_offset1[(code>>phrasebook_shift)];
868	offset = phrasebook_offset2[(offset<<phrasebook_shift) +
869	(code&((1<<phrasebook_shift)-1))];
870	if (!offset)
871	return 0;
872
873	i = 0;
874
875	for (;;) {
876	/* get word index */
877	word = phrasebook[offset] - phrasebook_short;
878	if (word >= 0) {
879	word = (word << 8) + phrasebook[offset+1];
880	offset += 2;
881	} else
882	word = phrasebook[offset++];
883	if (i) {
884	if (i > buflen)
885	return 0; /* buffer overflow */
886	buffer[i++] = ' ';
887	}
888	/* copy word string from lexicon. the last character in the
889	word has bit 7 set. the last word in a string ends with
890	0x80 */
891	w = lexicon + lexicon_offset[word];
892	while (*w < 128) {
893	if (i >= buflen)
894	return 0; /* buffer overflow */
895	buffer[i++] = *w++;
896	}
897	if (i >= buflen)
898	return 0; /* buffer overflow */
899	buffer[i++] = *w & 127;
900	if (*w == 128)
901	break; /* end of word */
902	}
903
904	return 1;
905	}
906
907	static int
908	_cmpname(PyObject self, int code, const char name, int namelen)
909	{
910	/* check if code corresponds to the given name */
911	int i;
912	char buffer[NAME_MAXLEN];
913	if (!_getucname(self, code, buffer, sizeof(buffer)))
914	return 0;
915	for (i = 0; i < namelen; i++) {
916	if (toupper(Py_CHARMASK(name[i])) != buffer[i])
917	return 0;
918	}
919	return buffer[namelen] == '\0';
920	}
921
922	static void
923	find_syllable(const char str, int len, int *pos, int count, int column)
924	{
925	int i, len1;
926	*len = -1;
927	for (i = 0; i < count; i++) {
928	char *s = hangul_syllables[i][column];
929	len1 = strlen(s);
930	if (len1 <= *len)
931	continue;
932	if (strncmp(str, s, len1) == 0) {
933	*len = len1;
934	*pos = i;
935	}
936	}
937	if (*len == -1) {
938	*len = 0;
939	}
940	}
941
942	static int
943	_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
944	{
945	unsigned int h, v;
946	unsigned int mask = code_size-1;
947	unsigned int i, incr;
948
949	/* Check for hangul syllables. */
950	if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
951	int len, L = -1, V = -1, T = -1;
952	const char *pos = name + 16;
953	find_syllable(pos, &len, &L, LCount, 0);
954	pos += len;
955	find_syllable(pos, &len, &V, VCount, 1);
956	pos += len;
957	find_syllable(pos, &len, &T, TCount, 2);
958	pos += len;
959	if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
960	code = SBase + (LVCount+V)*TCount + T;
961	return 1;
962	}
963	/* Otherwise, it's an illegal syllable name. */
964	return 0;
965	}
966
967	/* Check for unified ideographs. */
968	if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
969	/* Four or five hexdigits must follow. */
970	v = 0;
971	name += 22;
972	namelen -= 22;
973	if (namelen != 4 && namelen != 5)
974	return 0;
975	while (namelen--) {
976	v *= 16;
977	if (name >= '0' && name <= '9')
978	v += *name - '0';
979	else if (name >= 'A' && name <= 'F')
980	v += *name - 'A' + 10;
981	else
982	return 0;
983	name++;
984	}
985	if (!is_unified_ideograph(v))
986	return 0;
987	*code = v;
988	return 1;
989	}
990
991	/* the following is the same as python's dictionary lookup, with
992	only minor changes. see the makeunicodedata script for more
993	details */
994
995	h = (unsigned int) _gethash(name, namelen, code_magic);
996	i = (~h) & mask;
997	v = code_hash[i];
998	if (!v)
999	return 0;
1000	if (_cmpname(self, v, name, namelen)) {
1001	*code = v;
1002	return 1;
1003	}
1004	incr = (h ^ (h >> 3)) & mask;
1005	if (!incr)
1006	incr = mask;
1007	for (;;) {
1008	i = (i + incr) & mask;
1009	v = code_hash[i];
1010	if (!v)
1011	return 0;
1012	if (_cmpname(self, v, name, namelen)) {
1013	*code = v;
1014	return 1;
1015	}
1016	incr = incr << 1;
1017	if (incr > mask)
1018	incr = incr ^ code_poly;
1019	}
1020	}
1021
1022	static const _PyUnicode_Name_CAPI hashAPI =
1023	{
1024	sizeof(_PyUnicode_Name_CAPI),
1025	_getucname,
1026	_getcode
1027	};
1028
1029	/* -------------------------------------------------------------------- */
1030	/* Python bindings */
1031
1032	PyDoc_STRVAR(unicodedata_name__doc__,
1033	"name(unichr[, default])\n\
1034	Returns the name assigned to the Unicode character unichr as a\n\
1035	string. If no name is defined, default is returned, or, if not\n\
1036	given, ValueError is raised.");
1037
1038	static PyObject *
1039	unicodedata_name(PyObject* self, PyObject* args)
1040	{
1041	char name[NAME_MAXLEN];
1042
1043	PyUnicodeObject* v;
1044	PyObject* defobj = NULL;
1045	if (!PyArg_ParseTuple(args, "O!\|O:name", &PyUnicode_Type, &v, &defobj))
1046	return NULL;
1047
1048	if (PyUnicode_GET_SIZE(v) != 1) {
1049	PyErr_SetString(PyExc_TypeError,
1050	"need a single Unicode character as parameter");
1051	return NULL;
1052	}
1053
1054	if (!_getucname(self, (Py_UCS4) *PyUnicode_AS_UNICODE(v),
1055	name, sizeof(name))) {
1056	if (defobj == NULL) {
1057	PyErr_SetString(PyExc_ValueError, "no such name");
1058	return NULL;
1059	}
1060	else {
1061	Py_INCREF(defobj);
1062	return defobj;
1063	}
1064	}
1065
1066	return Py_BuildValue("s", name);
1067	}
1068
1069	PyDoc_STRVAR(unicodedata_lookup__doc__,
1070	"lookup(name)\n\
1071	\n\
1072	Look up character by name. If a character with the\n\
1073	given name is found, return the corresponding Unicode\n\
1074	character. If not found, KeyError is raised.");
1075
1076	static PyObject *
1077	unicodedata_lookup(PyObject* self, PyObject* args)
1078	{
1079	Py_UCS4 code;
1080	Py_UNICODE str[1];
1081	char errbuf[256];
1082
1083	char* name;
1084	int namelen;
1085	if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1086	return NULL;
1087
1088	if (!_getcode(self, name, namelen, &code)) {
1089	/* XXX(nnorwitz): why are we allocating for the error msg?
1090	Why not always use snprintf? */
1091	char fmt[] = "undefined character name '%s'";
1092	char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
1093	if (buf)
1094	sprintf(buf, fmt, name);
1095	else {
1096	buf = errbuf;
1097	PyOS_snprintf(buf, sizeof(errbuf), fmt, name);
1098	}
1099	PyErr_SetString(PyExc_KeyError, buf);
1100	if (buf != errbuf)
1101	PyMem_FREE(buf);
1102	return NULL;
1103	}
1104
1105	str[0] = (Py_UNICODE) code;
1106	return PyUnicode_FromUnicode(str, 1);
1107	}
1108
1109	/* XXX Add doc strings. */
1110
1111	static PyMethodDef unicodedata_functions[] = {
1112	{"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
1113	{"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
1114	{"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
1115	{"category", unicodedata_category, METH_VARARGS,
1116	unicodedata_category__doc__},
1117	{"bidirectional", unicodedata_bidirectional, METH_VARARGS,
1118	unicodedata_bidirectional__doc__},
1119	{"combining", unicodedata_combining, METH_VARARGS,
1120	unicodedata_combining__doc__},
1121	{"mirrored", unicodedata_mirrored, METH_VARARGS,
1122	unicodedata_mirrored__doc__},
1123	{"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
1124	unicodedata_east_asian_width__doc__},
1125	{"decomposition", unicodedata_decomposition, METH_VARARGS,
1126	unicodedata_decomposition__doc__},
1127	{"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
1128	{"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
1129	{"normalize", unicodedata_normalize, METH_VARARGS,
1130	unicodedata_normalize__doc__},
1131	{NULL, NULL} /* sentinel */
1132	};
1133
1134	static PyTypeObject UCD_Type = {
1135	/* The ob_type field must be initialized in the module init function
1136	* to be portable to Windows without using C++. */
1137	PyObject_HEAD_INIT(NULL)
1138	0, /ob_size/
1139	"unicodedata.UCD", /tp_name/
1140	sizeof(PreviousDBVersion), /tp_basicsize/
1141	0, /tp_itemsize/
1142	/* methods */
1143	(destructor)PyObject_Del, /tp_dealloc/
1144	0, /tp_print/
1145	0, /tp_getattr/
1146	0, /tp_setattr/
1147	0, /tp_compare/
1148	0, /tp_repr/
1149	0, /tp_as_number/
1150	0, /tp_as_sequence/
1151	0, /tp_as_mapping/
1152	0, /tp_hash/
1153	0, /tp_call/
1154	0, /tp_str/
1155	PyObject_GenericGetAttr,/tp_getattro/
1156	0, /tp_setattro/
1157	0, /tp_as_buffer/
1158	Py_TPFLAGS_DEFAULT, /tp_flags/
1159	0, /tp_doc/
1160	0, /tp_traverse/
1161	0, /tp_clear/
1162	0, /tp_richcompare/
1163	0, /tp_weaklistoffset/
1164	0, /tp_iter/
1165	0, /tp_iternext/
1166	unicodedata_functions, /tp_methods/
1167	DB_members, /tp_members/
1168	0, /tp_getset/
1169	0, /tp_base/
1170	0, /tp_dict/
1171	0, /tp_descr_get/
1172	0, /tp_descr_set/
1173	0, /tp_dictoffset/
1174	0, /tp_init/
1175	0, /tp_alloc/
1176	0, /tp_new/
1177	0, /tp_free/
1178	0, /tp_is_gc/
1179	};
1180
1181	PyDoc_STRVAR(unicodedata_docstring,
1182	"This module provides access to the Unicode Character Database which\n\
1183	defines character properties for all Unicode characters. The data in\n\
1184	this database is based on the UnicodeData.txt file version\n\
1185	4.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
1186	\n\
1187	The module uses the same names and symbols as defined by the\n\
1188	UnicodeData File Format 4.1.0 (see\n\
1189	http://www.unicode.org/Public/4.1.0/ucd/UCD.html).");
1190
1191	PyMODINIT_FUNC
1192	initunicodedata(void)
1193	{
1194	PyObject m, v;
1195
1196	UCD_Type.ob_type = &PyType_Type;
1197
1198	m = Py_InitModule3(
1199	"unicodedata", unicodedata_functions, unicodedata_docstring);
1200	if (!m)
1201	return;
1202
1203	PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1204	Py_INCREF(&UCD_Type);
1205	PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1206
1207	/* Previous versions */
1208	v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1209	if (v != NULL)
1210	PyModule_AddObject(m, "ucd_3_2_0", v);
1211
1212	/* Export C API */
1213	v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
1214	if (v != NULL)
1215	PyModule_AddObject(m, "ucnhash_CAPI", v);
1216	}
1217
1218	/*
1219	Local variables:
1220	c-basic-offset: 4
1221	indent-tabs-mode: nil
1222	End:
1223	*/

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/essentials/dev-lang/python/Modules/unicodedata.c@ 3506

Download in other formats: