Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

tokenizer.c@ 3506

Visit:

Last change on this file since 3506 was 3225, checked in by bird, 19 years ago
Python 2.5
File size: 33.1 KB

Line
1
2	/* Tokenizer implementation */
3
4	#include "Python.h"
5	#include "pgenheaders.h"
6
7	#include <ctype.h>
8	#include <assert.h>
9
10	#include "tokenizer.h"
11	#include "errcode.h"
12
13	#ifndef PGEN
14	#include "unicodeobject.h"
15	#include "stringobject.h"
16	#include "fileobject.h"
17	#include "codecs.h"
18	#include "abstract.h"
19	#endif /* PGEN */
20
/* Line reader used for interactive input (see tok_nextc).
   Return malloc'ed string including trailing \n;
   empty malloc'ed string for EOF;
   NULL if interrupted */
extern char *PyOS_Readline(FILE *, FILE *, char *);

/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

/* Convert a possibly signed character to a nonnegative int */
/* XXX This assumes characters are 8 bits wide */
#ifdef __CHAR_UNSIGNED__
#define Py_CHARMASK(c)		(c)
#else
#define Py_CHARMASK(c)		((c) & 0xff)
#endif

/* Forward declarations of helpers defined later in this file */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
41
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",

    /* single-character delimiters and operators */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",

    /* comparisons, remaining unary/binary operators, shifts, power */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX",
    "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",

    /* augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",

    /* floor division and decorator marker */
    "DOUBLESLASH", "DOUBLESLASHEQUAL", "AT",

    /* catch-all operator and sentinels */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
101
102
103	/* Create and initialize a new tok_state structure */
104
105	static struct tok_state *
106	tok_new(void)
107	{
108	struct tok_state tok = (struct tok_state )PyMem_MALLOC(
109	sizeof(struct tok_state));
110	if (tok == NULL)
111	return NULL;
112	tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
113	tok->done = E_OK;
114	tok->fp = NULL;
115	tok->tabsize = TABSIZE;
116	tok->indent = 0;
117	tok->indstack[0] = 0;
118	tok->atbol = 1;
119	tok->pendin = 0;
120	tok->prompt = tok->nextprompt = NULL;
121	tok->lineno = 0;
122	tok->level = 0;
123	tok->filename = NULL;
124	tok->altwarning = 0;
125	tok->alterror = 0;
126	tok->alttabsize = 1;
127	tok->altindstack[0] = 0;
128	tok->decoding_state = 0;
129	tok->decoding_erred = 0;
130	tok->read_coding_spec = 0;
131	tok->encoding = NULL;
132	tok->cont_line = 0;
133	#ifndef PGEN
134	tok->decoding_readline = NULL;
135	tok->decoding_buffer = NULL;
136	#endif
137	return tok;
138	}
139
140	#ifdef PGEN
141
142	static char *
143	decoding_fgets(char s, int size, struct tok_state tok)
144	{
145	return fgets(s, size, tok->fp);
146	}
147
148	static int
149	decoding_feof(struct tok_state *tok)
150	{
151	return feof(tok->fp);
152	}
153
/* PGEN build: strings are used as-is; STR is returned unchanged and TOK
   is not touched. */
static const char *
decode_str(const char *str, struct tok_state *tok)
{
    return str;
}
159
160	#else /* PGEN */
161
162	static char *
163	error_ret(struct tok_state tok) / XXX */
164	{
165	tok->decoding_erred = 1;
166	if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
167	PyMem_FREE(tok->buf);
168	tok->buf = NULL;
169	return NULL; /* as if it were EOF */
170	}
171
172	static char *
173	new_string(const char *s, Py_ssize_t len)
174	{
175	char* result = (char *)PyMem_MALLOC(len + 1);
176	if (result != NULL) {
177	memcpy(result, s, len);
178	result[len] = '\0';
179	}
180	return result;
181	}
182
/* Map the spelling variants of utf-8 and latin-1 onto canonical names.

   Only the first 12 characters of S are examined; they are lower-cased
   and '_' is treated as '-'.  Returns the static string "utf-8" or
   "iso-8859-1" when S names one of those encodings (optionally followed
   by a "-suffix"), otherwise returns S itself.  Callers can therefore
   detect a match by comparing the result pointer with S. */
static char *
get_normal_name(char *s)	/* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            /* tolower() is only defined for unsigned char values and
               EOF; plain char may be negative for bytes >= 0x80. */
            buf[i] = (char)tolower((unsigned char)c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
205
206	/* Return the coding spec in S, or NULL if none is found. */
207
208	static char *
209	get_coding_spec(const char *s, Py_ssize_t size)
210	{
211	Py_ssize_t i;
212	/* Coding spec must be in a comment, and that comment must be
213	* the only statement on the source code line. */
214	for (i = 0; i < size - 6; i++) {
215	if (s[i] == '#')
216	break;
217	if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
218	return NULL;
219	}
220	for (; i < size - 6; i++) { /* XXX inefficient search */
221	const char* t = s + i;
222	if (strncmp(t, "coding", 6) == 0) {
223	const char* begin = NULL;
224	t += 6;
225	if (t[0] != ':' && t[0] != '=')
226	continue;
227	do {
228	t++;
229	} while (t[0] == '\x20' \|\| t[0] == '\t');
230
231	begin = t;
232	while (isalnum(Py_CHARMASK(t[0])) \|\|
233	t[0] == '-' \|\| t[0] == '_' \|\| t[0] == '.')
234	t++;
235
236	if (begin < t) {
237	char* r = new_string(begin, t - begin);
238	char* q = get_normal_name(r);
239	if (r != q) {
240	PyMem_FREE(r);
241	r = new_string(q, strlen(q));
242	}
243	return r;
244	}
245	}
246	}
247	return NULL;
248	}
249
250	/* Check whether the line contains a coding spec. If it does,
251	invoke the set_readline function for the new encoding.
252	This function receives the tok_state and the new encoding.
253	Return 1 on success, 0 on failure. */
254
255	static int
256	check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
257	int set_readline(struct tok_state , const char ))
258	{
259	char * cs;
260	int r = 1;
261
262	if (tok->cont_line)
263	/* It's a continuation line, so it can't be a coding spec. */
264	return 1;
265	cs = get_coding_spec(line, size);
266	if (cs != NULL) {
267	tok->read_coding_spec = 1;
268	if (tok->encoding == NULL) {
269	assert(tok->decoding_state == 1); /* raw */
270	if (strcmp(cs, "utf-8") == 0 \|\|
271	strcmp(cs, "iso-8859-1") == 0) {
272	tok->encoding = cs;
273	} else {
274	#ifdef Py_USING_UNICODE
275	r = set_readline(tok, cs);
276	if (r) {
277	tok->encoding = cs;
278	tok->decoding_state = -1;
279	}
280	else
281	PyMem_FREE(cs);
282	#else
283	/* Without Unicode support, we cannot
284	process the coding spec. Since there
285	won't be any Unicode literals, that
286	won't matter. */
287	PyMem_FREE(cs);
288	#endif
289	}
290	} else { /* then, compare cs with BOM */
291	r = (strcmp(tok->encoding, cs) == 0);
292	PyMem_FREE(cs);
293	}
294	}
295	if (!r) {
296	cs = tok->encoding;
297	if (!cs)
298	cs = "with BOM";
299	PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
300	}
301	return r;
302	}
303
304	/* See whether the file starts with a BOM. If it does,
305	invoke the set_readline function with the new encoding.
306	Return 1 on success, 0 on failure. */
307
308	static int
309	check_bom(int get_char(struct tok_state *),
310	void unget_char(int, struct tok_state *),
311	int set_readline(struct tok_state , const char ),
312	struct tok_state *tok)
313	{
314	int ch = get_char(tok);
315	tok->decoding_state = 1;
316	if (ch == EOF) {
317	return 1;
318	} else if (ch == 0xEF) {
319	ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
320	ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
321	#if 0
322	/* Disable support for UTF-16 BOMs until a decision
323	is made whether this needs to be supported. */
324	} else if (ch == 0xFE) {
325	ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
326	if (!set_readline(tok, "utf-16-be")) return 0;
327	tok->decoding_state = -1;
328	} else if (ch == 0xFF) {
329	ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
330	if (!set_readline(tok, "utf-16-le")) return 0;
331	tok->decoding_state = -1;
332	#endif
333	} else {
334	unget_char(ch, tok);
335	return 1;
336	}
337	if (tok->encoding != NULL)
338	PyMem_FREE(tok->encoding);
339	tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
340	return 1;
341	NON_BOM:
342	/* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
343	unget_char(0xFF, tok); /* XXX this will cause a syntax error */
344	return 1;
345	}
346
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   At most SIZE-1 bytes are stored, followed by a terminating NUL.
   NULL is also returned at EOF (an empty line from the reader).

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
           stored the result in tok->decoding_buffer
     3) PyStringObject *: previous call to fp_readl did not have enough room
           (in the s buffer) to copy entire contents of the line read
           by tok->decoding_readline.  tok->decoding_buffer has the overflow.
           In this case, fp_readl is called in a loop (with an expanded buffer)
           until the buffer ends with a '\n' (or until the end of the file is
           reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject* utf8 = NULL;
    PyObject* buf = tok->decoding_buffer;
    char *str;
    Py_ssize_t utf8len;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (buf == NULL) {
        buf = PyObject_CallObject(tok->decoding_readline, NULL);
        if (buf == NULL)
            return error_ret(tok);
    } else {
        /* take over the reference held in tok->decoding_buffer */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(buf))
            utf8 = buf;         /* case 3: already UTF-8 bytes */
    }
    if (utf8 == NULL) {
        utf8 = PyUnicode_AsUTF8String(buf);
        Py_DECREF(buf);
        if (utf8 == NULL)
            return error_ret(tok);
    }
    str = PyString_AsString(utf8);
    utf8len = PyString_GET_SIZE(utf8);
    if (utf8len > size) {
        /* line does not fit: stash the tail for the next call */
        tok->decoding_buffer = PyString_FromStringAndSize(str+size, utf8len-size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(utf8);
            return error_ret(tok);
        }
        utf8len = size;
    }
    memcpy(s, str, utf8len);
    s[utf8len] = '\0';
    Py_DECREF(utf8);
    if (utf8len == 0)
        return NULL; /* EOF */
    return s;
#endif
}
411
412	/* Set the readline function for TOK to a StreamReader's
413	readline function. The StreamReader is named ENC.
414
415	This function is called from check_bom and check_coding_spec.
416
417	ENC is usually identical to the future value of tok->encoding,
418	except for the (currently unsupported) case of UTF-16.
419
420	Return 1 on success, 0 on failure. */
421
422	static int
423	fp_setreadl(struct tok_state tok, const char enc)
424	{
425	PyObject reader, stream, *readline;
426
427	/* XXX: constify filename argument. */
428	stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);
429	if (stream == NULL)
430	return 0;
431
432	reader = PyCodec_StreamReader(enc, stream, NULL);
433	Py_DECREF(stream);
434	if (reader == NULL)
435	return 0;
436
437	readline = PyObject_GetAttrString(reader, "readline");
438	Py_DECREF(reader);
439	if (readline == NULL)
440	return 0;
441
442	tok->decoding_readline = readline;
443	return 1;
444	}
445
446	/* Fetch the next byte from TOK. */
447
448	static int fp_getc(struct tok_state *tok) {
449	return getc(tok->fp);
450	}
451
452	/* Unfetch the last byte back into TOK. */
453
454	static void fp_ungetc(int c, struct tok_state *tok) {
455	ungetc(c, tok->fp);
456	}
457
458	/* Read a line of input from TOK. Determine encoding
459	if necessary. */
460
461	static char *
462	decoding_fgets(char s, int size, struct tok_state tok)
463	{
464	char *line = NULL;
465	int badchar = 0;
466	for (;;) {
467	if (tok->decoding_state < 0) {
468	/* We already have a codec associated with
469	this input. */
470	line = fp_readl(s, size, tok);
471	break;
472	} else if (tok->decoding_state > 0) {
473	/* We want a 'raw' read. */
474	line = Py_UniversalNewlineFgets(s, size,
475	tok->fp, NULL);
476	break;
477	} else {
478	/* We have not yet determined the encoding.
479	If an encoding is found, use the file-pointer
480	reader functions from now on. */
481	if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
482	return error_ret(tok);
483	assert(tok->decoding_state != 0);
484	}
485	}
486	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
487	if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
488	return error_ret(tok);
489	}
490	}
491	#ifndef PGEN
492	/* The default encoding is ASCII, so make sure we don't have any
493	non-ASCII bytes in it. */
494	if (line && !tok->encoding) {
495	unsigned char *c;
496	for (c = (unsigned char )line; c; c++)
497	if (*c > 127) {
498	badchar = *c;
499	break;
500	}
501	}
502	if (badchar) {
503	char buf[500];
504	/* Need to add 1 to the line number, since this line
505	has not been counted, yet. */
506	sprintf(buf,
507	"Non-ASCII character '\\x%.2x' "
508	"in file %.200s on line %i, "
509	"but no encoding declared; "
510	"see http://www.python.org/peps/pep-0263.html for details",
511	badchar, tok->filename, tok->lineno + 1);
512	PyErr_SetString(PyExc_SyntaxError, buf);
513	return error_ret(tok);
514	}
515	#endif
516	return line;
517	}
518
519	static int
520	decoding_feof(struct tok_state *tok)
521	{
522	if (tok->decoding_state >= 0) {
523	return feof(tok->fp);
524	} else {
525	PyObject* buf = tok->decoding_buffer;
526	if (buf == NULL) {
527	buf = PyObject_CallObject(tok->decoding_readline, NULL);
528	if (buf == NULL) {
529	error_ret(tok);
530	return 1;
531	} else {
532	tok->decoding_buffer = buf;
533	}
534	}
535	return PyObject_Length(buf) == 0;
536	}
537	}
538
539	/* Fetch a byte from TOK, using the string buffer. */
540
541	static int
542	buf_getc(struct tok_state *tok) {
543	return Py_CHARMASK(*tok->str++);
544	}
545
546	/* Unfetch a byte from TOK, using the string buffer. */
547
548	static void
549	buf_ungetc(int c, struct tok_state *tok) {
550	tok->str--;
551	assert(Py_CHARMASK(tok->str) == c); / tok->cur may point to read-only segment */
552	}
553
554	/* Set the readline function for TOK to ENC. For the string-based
555	tokenizer, this means to just record the encoding. */
556
557	static int
558	buf_setreadl(struct tok_state tok, const char enc) {
559	tok->enc = enc;
560	return 1;
561	}
562
563	/* Return a UTF-8 encoding Python string object from the
564	C byte string STR, which is encoded with ENC. */
565
566	#ifdef Py_USING_UNICODE
567	static PyObject *
568	translate_into_utf8(const char* str, const char* enc) {
569	PyObject *utf8;
570	PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
571	if (buf == NULL)
572	return NULL;
573	utf8 = PyUnicode_AsUTF8String(buf);
574	Py_DECREF(buf);
575	return utf8;
576	}
577	#endif
578
579	/* Decode a byte string STR for use as the buffer of TOK.
580	Look for encoding declarations inside STR, and record them
581	inside TOK. */
582
583	static const char *
584	decode_str(const char str, struct tok_state tok)
585	{
586	PyObject* utf8 = NULL;
587	const char *s;
588	int lineno = 0;
589	tok->enc = NULL;
590	tok->str = str;
591	if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
592	return error_ret(tok);
593	str = tok->str; /* string after BOM if any */
594	assert(str);
595	#ifdef Py_USING_UNICODE
596	if (tok->enc != NULL) {
597	utf8 = translate_into_utf8(str, tok->enc);
598	if (utf8 == NULL)
599	return error_ret(tok);
600	str = PyString_AsString(utf8);
601	}
602	#endif
603	for (s = str;; s++) {
604	if (*s == '\0') break;
605	else if (*s == '\n') {
606	lineno++;
607	if (lineno == 2) break;
608	}
609	}
610	tok->enc = NULL;
611	if (!check_coding_spec(str, s - str, tok, buf_setreadl))
612	return error_ret(tok);
613	#ifdef Py_USING_UNICODE
614	if (tok->enc != NULL) {
615	assert(utf8 == NULL);
616	utf8 = translate_into_utf8(str, tok->enc);
617	if (utf8 == NULL) {
618	PyErr_Format(PyExc_SyntaxError,
619	"unknown encoding: %s", tok->enc);
620	return error_ret(tok);
621	}
622	str = PyString_AsString(utf8);
623	}
624	#endif
625	assert(tok->decoding_buffer == NULL);
626	tok->decoding_buffer = utf8; /* CAUTION */
627	return str;
628	}
629
630	#endif /* PGEN */
631
632	/* Set up tokenizer for string */
633
634	struct tok_state *
635	PyTokenizer_FromString(const char *str)
636	{
637	struct tok_state *tok = tok_new();
638	if (tok == NULL)
639	return NULL;
640	str = (char *)decode_str(str, tok);
641	if (str == NULL) {
642	PyTokenizer_Free(tok);
643	return NULL;
644	}
645
646	/* XXX: constify members. */
647	tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
648	return tok;
649	}
650
651
652	/* Set up tokenizer for file */
653
654	struct tok_state *
655	PyTokenizer_FromFile(FILE fp, char ps1, char *ps2)
656	{
657	struct tok_state *tok = tok_new();
658	if (tok == NULL)
659	return NULL;
660	if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
661	PyTokenizer_Free(tok);
662	return NULL;
663	}
664	tok->cur = tok->inp = tok->buf;
665	tok->end = tok->buf + BUFSIZ;
666	tok->fp = fp;
667	tok->prompt = ps1;
668	tok->nextprompt = ps2;
669	return tok;
670	}
671
672
673	/* Free a tok_state structure */
674
675	void
676	PyTokenizer_Free(struct tok_state *tok)
677	{
678	if (tok->encoding != NULL)
679	PyMem_FREE(tok->encoding);
680	#ifndef PGEN
681	Py_XDECREF(tok->decoding_readline);
682	Py_XDECREF(tok->decoding_buffer);
683	#endif
684	if (tok->fp != NULL && tok->buf != NULL)
685	PyMem_FREE(tok->buf);
686	PyMem_FREE(tok);
687	}
688
689	#if !defined(PGEN) && defined(Py_USING_UNICODE)
690	static int
691	tok_stdin_decode(struct tok_state tok, char *inp)
692	{
693	PyObject enc, sysstdin, decoded, utf8;
694	const char *encoding;
695	char *converted;
696
697	if (PySys_GetFile((char *)"stdin", NULL) != stdin)
698	return 0;
699	sysstdin = PySys_GetObject("stdin");
700	if (sysstdin == NULL \|\| !PyFile_Check(sysstdin))
701	return 0;
702
703	enc = ((PyFileObject *)sysstdin)->f_encoding;
704	if (enc == NULL \|\| !PyString_Check(enc))
705	return 0;
706	Py_INCREF(enc);
707
708	encoding = PyString_AsString(enc);
709	decoded = PyUnicode_Decode(inp, strlen(inp), encoding, NULL);
710	if (decoded == NULL)
711	goto error_clear;
712
713	utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
714	Py_DECREF(decoded);
715	if (utf8 == NULL)
716	goto error_clear;
717
718	assert(PyString_Check(utf8));
719	converted = new_string(PyString_AS_STRING(utf8),
720	PyString_GET_SIZE(utf8));
721	Py_DECREF(utf8);
722	if (converted == NULL)
723	goto error_nomem;
724
725	PyMem_FREE(*inp);
726	*inp = converted;
727	if (tok->encoding != NULL)
728	PyMem_FREE(tok->encoding);
729	tok->encoding = new_string(encoding, strlen(encoding));
730	if (tok->encoding == NULL)
731	goto error_nomem;
732
733	Py_DECREF(enc);
734	return 0;
735
736	error_nomem:
737	Py_DECREF(enc);
738	tok->done = E_NOMEM;
739	return -1;
740
741	error_clear:
742	/* Fallback to iso-8859-1: for backward compatibility */
743	Py_DECREF(enc);
744	PyErr_Clear();
745	return 0;
746	}
747	#endif
748
749	/* Get next char, updating state; error code goes into tok->done */
750
751	static int
752	tok_nextc(register struct tok_state *tok)
753	{
754	for (;;) {
755	if (tok->cur != tok->inp) {
756	return Py_CHARMASK(tok->cur++); / Fast path */
757	}
758	if (tok->done != E_OK)
759	return EOF;
760	if (tok->fp == NULL) {
761	char *end = strchr(tok->inp, '\n');
762	if (end != NULL)
763	end++;
764	else {
765	end = strchr(tok->inp, '\0');
766	if (end == tok->inp) {
767	tok->done = E_EOF;
768	return EOF;
769	}
770	}
771	if (tok->start == NULL)
772	tok->buf = tok->cur;
773	tok->line_start = tok->cur;
774	tok->lineno++;
775	tok->inp = end;
776	return Py_CHARMASK(*tok->cur++);
777	}
778	if (tok->prompt != NULL) {
779	char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
780	if (tok->nextprompt != NULL)
781	tok->prompt = tok->nextprompt;
782	if (newtok == NULL)
783	tok->done = E_INTR;
784	else if (*newtok == '\0') {
785	PyMem_FREE(newtok);
786	tok->done = E_EOF;
787	}
788	#if !defined(PGEN) && defined(Py_USING_UNICODE)
789	else if (tok_stdin_decode(tok, &newtok) != 0)
790	PyMem_FREE(newtok);
791	#endif
792	else if (tok->start != NULL) {
793	size_t start = tok->start - tok->buf;
794	size_t oldlen = tok->cur - tok->buf;
795	size_t newlen = oldlen + strlen(newtok);
796	char *buf = tok->buf;
797	buf = (char *)PyMem_REALLOC(buf, newlen+1);
798	tok->lineno++;
799	if (buf == NULL) {
800	PyMem_FREE(tok->buf);
801	tok->buf = NULL;
802	PyMem_FREE(newtok);
803	tok->done = E_NOMEM;
804	return EOF;
805	}
806	tok->buf = buf;
807	tok->cur = tok->buf + oldlen;
808	tok->line_start = tok->cur;
809	strcpy(tok->buf + oldlen, newtok);
810	PyMem_FREE(newtok);
811	tok->inp = tok->buf + newlen;
812	tok->end = tok->inp + 1;
813	tok->start = tok->buf + start;
814	}
815	else {
816	tok->lineno++;
817	if (tok->buf != NULL)
818	PyMem_FREE(tok->buf);
819	tok->buf = newtok;
820	tok->line_start = tok->buf;
821	tok->cur = tok->buf;
822	tok->line_start = tok->buf;
823	tok->inp = strchr(tok->buf, '\0');
824	tok->end = tok->inp + 1;
825	}
826	}
827	else {
828	int done = 0;
829	Py_ssize_t cur = 0;
830	char *pt;
831	if (tok->start == NULL) {
832	if (tok->buf == NULL) {
833	tok->buf = (char *)
834	PyMem_MALLOC(BUFSIZ);
835	if (tok->buf == NULL) {
836	tok->done = E_NOMEM;
837	return EOF;
838	}
839	tok->end = tok->buf + BUFSIZ;
840	}
841	if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
842	tok) == NULL) {
843	tok->done = E_EOF;
844	done = 1;
845	}
846	else {
847	tok->done = E_OK;
848	tok->inp = strchr(tok->buf, '\0');
849	done = tok->inp[-1] == '\n';
850	}
851	}
852	else {
853	cur = tok->cur - tok->buf;
854	if (decoding_feof(tok)) {
855	tok->done = E_EOF;
856	done = 1;
857	}
858	else
859	tok->done = E_OK;
860	}
861	tok->lineno++;
862	/* Read until '\n' or EOF */
863	while (!done) {
864	Py_ssize_t curstart = tok->start == NULL ? -1 :
865	tok->start - tok->buf;
866	Py_ssize_t curvalid = tok->inp - tok->buf;
867	Py_ssize_t newsize = curvalid + BUFSIZ;
868	char *newbuf = tok->buf;
869	newbuf = (char *)PyMem_REALLOC(newbuf,
870	newsize);
871	if (newbuf == NULL) {
872	tok->done = E_NOMEM;
873	tok->cur = tok->inp;
874	return EOF;
875	}
876	tok->buf = newbuf;
877	tok->inp = tok->buf + curvalid;
878	tok->end = tok->buf + newsize;
879	tok->start = curstart < 0 ? NULL :
880	tok->buf + curstart;
881	if (decoding_fgets(tok->inp,
882	(int)(tok->end - tok->inp),
883	tok) == NULL) {
884	/* Break out early on decoding
885	errors, as tok->buf will be NULL
886	*/
887	if (tok->decoding_erred)
888	return EOF;
889	/* Last line does not end in \n,
890	fake one */
891	strcpy(tok->inp, "\n");
892	}
893	tok->inp = strchr(tok->inp, '\0');
894	done = tok->inp[-1] == '\n';
895	}
896	if (tok->buf != NULL) {
897	tok->cur = tok->buf + cur;
898	tok->line_start = tok->cur;
899	/* replace "\r\n" with "\n" */
900	/* For Mac leave the \r, giving syntax error */
901	pt = tok->inp - 2;
902	if (pt >= tok->buf && *pt == '\r') {
903	*pt++ = '\n';
904	*pt = '\0';
905	tok->inp = pt;
906	}
907	}
908	}
909	if (tok->done != E_OK) {
910	if (tok->prompt != NULL)
911	PySys_WriteStderr("\n");
912	tok->cur = tok->inp;
913	return EOF;
914	}
915	}
916	/NOTREACHED/
917	}
918
919
920	/* Back-up one character */
921
922	static void
923	tok_backup(register struct tok_state *tok, register int c)
924	{
925	if (c != EOF) {
926	if (--tok->cur < tok->buf)
927	Py_FatalError("tok_backup: begin of buffer");
928	if (*tok->cur != c)
929	*tok->cur = c;
930	}
931	}
932
933
934	/* Return the token corresponding to a single character */
935
936	int
937	PyToken_OneChar(int c)
938	{
939	switch (c) {
940	case '(': return LPAR;
941	case ')': return RPAR;
942	case '[': return LSQB;
943	case ']': return RSQB;
944	case ':': return COLON;
945	case ',': return COMMA;
946	case ';': return SEMI;
947	case '+': return PLUS;
948	case '-': return MINUS;
949	case '*': return STAR;
950	case '/': return SLASH;
951	case '\|': return VBAR;
952	case '&': return AMPER;
953	case '<': return LESS;
954	case '>': return GREATER;
955	case '=': return EQUAL;
956	case '.': return DOT;
957	case '%': return PERCENT;
958	case '`': return BACKQUOTE;
959	case '{': return LBRACE;
960	case '}': return RBRACE;
961	case '^': return CIRCUMFLEX;
962	case '~': return TILDE;
963	case '@': return AT;
964	default: return OP;
965	}
966	}
967
968
969	int
970	PyToken_TwoChars(int c1, int c2)
971	{
972	switch (c1) {
973	case '=':
974	switch (c2) {
975	case '=': return EQEQUAL;
976	}
977	break;
978	case '!':
979	switch (c2) {
980	case '=': return NOTEQUAL;
981	}
982	break;
983	case '<':
984	switch (c2) {
985	case '>': return NOTEQUAL;
986	case '=': return LESSEQUAL;
987	case '<': return LEFTSHIFT;
988	}
989	break;
990	case '>':
991	switch (c2) {
992	case '=': return GREATEREQUAL;
993	case '>': return RIGHTSHIFT;
994	}
995	break;
996	case '+':
997	switch (c2) {
998	case '=': return PLUSEQUAL;
999	}
1000	break;
1001	case '-':
1002	switch (c2) {
1003	case '=': return MINEQUAL;
1004	}
1005	break;
1006	case '*':
1007	switch (c2) {
1008	case '*': return DOUBLESTAR;
1009	case '=': return STAREQUAL;
1010	}
1011	break;
1012	case '/':
1013	switch (c2) {
1014	case '/': return DOUBLESLASH;
1015	case '=': return SLASHEQUAL;
1016	}
1017	break;
1018	case '\|':
1019	switch (c2) {
1020	case '=': return VBAREQUAL;
1021	}
1022	break;
1023	case '%':
1024	switch (c2) {
1025	case '=': return PERCENTEQUAL;
1026	}
1027	break;
1028	case '&':
1029	switch (c2) {
1030	case '=': return AMPEREQUAL;
1031	}
1032	break;
1033	case '^':
1034	switch (c2) {
1035	case '=': return CIRCUMFLEXEQUAL;
1036	}
1037	break;
1038	}
1039	return OP;
1040	}
1041
1042	int
1043	PyToken_ThreeChars(int c1, int c2, int c3)
1044	{
1045	switch (c1) {
1046	case '<':
1047	switch (c2) {
1048	case '<':
1049	switch (c3) {
1050	case '=':
1051	return LEFTSHIFTEQUAL;
1052	}
1053	break;
1054	}
1055	break;
1056	case '>':
1057	switch (c2) {
1058	case '>':
1059	switch (c3) {
1060	case '=':
1061	return RIGHTSHIFTEQUAL;
1062	}
1063	break;
1064	}
1065	break;
1066	case '*':
1067	switch (c2) {
1068	case '*':
1069	switch (c3) {
1070	case '=':
1071	return DOUBLESTAREQUAL;
1072	}
1073	break;
1074	}
1075	break;
1076	case '/':
1077	switch (c2) {
1078	case '/':
1079	switch (c3) {
1080	case '=':
1081	return DOUBLESLASHEQUAL;
1082	}
1083	break;
1084	}
1085	break;
1086	}
1087	return OP;
1088	}
1089
1090	static int
1091	indenterror(struct tok_state *tok)
1092	{
1093	if (tok->alterror) {
1094	tok->done = E_TABSPACE;
1095	tok->cur = tok->inp;
1096	return 1;
1097	}
1098	if (tok->altwarning) {
1099	PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1100	"in indentation\n", tok->filename);
1101	tok->altwarning = 0;
1102	}
1103	return 0;
1104	}
1105
1106
1107	/* Get next token, after space stripping etc. */
1108
1109	static int
1110	tok_get(register struct tok_state tok, char p_start, char *p_end)
1111	{
1112	register int c;
1113	int blankline;
1114
1115	p_start = p_end = NULL;
1116	nextline:
1117	tok->start = NULL;
1118	blankline = 0;
1119
1120	/* Get indentation level */
1121	if (tok->atbol) {
1122	register int col = 0;
1123	register int altcol = 0;
1124	tok->atbol = 0;
1125	for (;;) {
1126	c = tok_nextc(tok);
1127	if (c == ' ')
1128	col++, altcol++;
1129	else if (c == '\t') {
1130	col = (col/tok->tabsize + 1) * tok->tabsize;
1131	altcol = (altcol/tok->alttabsize + 1)
1132	* tok->alttabsize;
1133	}
1134	else if (c == '\014') /* Control-L (formfeed) */
1135	col = altcol = 0; /* For Emacs users */
1136	else
1137	break;
1138	}
1139	tok_backup(tok, c);
1140	if (c == '#' \|\| c == '\n') {
1141	/* Lines with only whitespace and/or comments
1142	shouldn't affect the indentation and are
1143	not passed to the parser as NEWLINE tokens,
1144	except totally empty lines in interactive
1145	mode, which signal the end of a command group. */
1146	if (col == 0 && c == '\n' && tok->prompt != NULL)
1147	blankline = 0; /* Let it through */
1148	else
1149	blankline = 1; /* Ignore completely */
1150	/* We can't jump back right here since we still
1151	may need to skip to the end of a comment */
1152	}
1153	if (!blankline && tok->level == 0) {
1154	if (col == tok->indstack[tok->indent]) {
1155	/* No change */
1156	if (altcol != tok->altindstack[tok->indent]) {
1157	if (indenterror(tok))
1158	return ERRORTOKEN;
1159	}
1160	}
1161	else if (col > tok->indstack[tok->indent]) {
1162	/* Indent -- always one */
1163	if (tok->indent+1 >= MAXINDENT) {
1164	tok->done = E_TOODEEP;
1165	tok->cur = tok->inp;
1166	return ERRORTOKEN;
1167	}
1168	if (altcol <= tok->altindstack[tok->indent]) {
1169	if (indenterror(tok))
1170	return ERRORTOKEN;
1171	}
1172	tok->pendin++;
1173	tok->indstack[++tok->indent] = col;
1174	tok->altindstack[tok->indent] = altcol;
1175	}
1176	else /* col < tok->indstack[tok->indent] */ {
1177	/* Dedent -- any number, must be consistent */
1178	while (tok->indent > 0 &&
1179	col < tok->indstack[tok->indent]) {
1180	tok->pendin--;
1181	tok->indent--;
1182	}
1183	if (col != tok->indstack[tok->indent]) {
1184	tok->done = E_DEDENT;
1185	tok->cur = tok->inp;
1186	return ERRORTOKEN;
1187	}
1188	if (altcol != tok->altindstack[tok->indent]) {
1189	if (indenterror(tok))
1190	return ERRORTOKEN;
1191	}
1192	}
1193	}
1194	}
1195
1196	tok->start = tok->cur;
1197
1198	/* Return pending indents/dedents */
1199	if (tok->pendin != 0) {
1200	if (tok->pendin < 0) {
1201	tok->pendin++;
1202	return DEDENT;
1203	}
1204	else {
1205	tok->pendin--;
1206	return INDENT;
1207	}
1208	}
1209
1210	again:
1211	tok->start = NULL;
1212	/* Skip spaces */
1213	do {
1214	c = tok_nextc(tok);
1215	} while (c == ' ' \|\| c == '\t' \|\| c == '\014');
1216
1217	/* Set start of current token */
1218	tok->start = tok->cur - 1;
1219
1220	/* Skip comment, while looking for tab-setting magic */
1221	if (c == '#') {
1222	static char *tabforms[] = {
1223	"tab-width:", /* Emacs */
1224	":tabstop=", /* vim, full form */
1225	":ts=", /* vim, abbreviated form */
1226	"set tabsize=", /* will vi never die? */
1227	/* more templates can be added here to support other editors */
1228	};
1229	char cbuf[80];
1230	char tp, *cp;
1231	tp = cbuf;
1232	do {
1233	*tp++ = c = tok_nextc(tok);
1234	} while (c != EOF && c != '\n' &&
1235	(size_t)(tp - cbuf + 1) < sizeof(cbuf));
1236	*tp = '\0';
1237	for (cp = tabforms;
1238	cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);
1239	cp++) {
1240	if ((tp = strstr(cbuf, *cp))) {
1241	int newsize = atoi(tp + strlen(*cp));
1242
1243	if (newsize >= 1 && newsize <= 40) {
1244	tok->tabsize = newsize;
1245	if (Py_VerboseFlag)
1246	PySys_WriteStderr(
1247	"Tab size set to %d\n",
1248	newsize);
1249	}
1250	}
1251	}
1252	while (c != EOF && c != '\n')
1253	c = tok_nextc(tok);
1254	}
1255
1256	/* Check for EOF and errors now */
1257	if (c == EOF) {
1258	return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1259	}
1260
1261	/* Identifier (most frequent token!) */
1262	if (isalpha(c) \|\| c == '_') {
1263	/* Process r"", u"" and ur"" */
1264	switch (c) {
1265	case 'r':
1266	case 'R':
1267	c = tok_nextc(tok);
1268	if (c == '"' \|\| c == '\'')
1269	goto letter_quote;
1270	break;
1271	case 'u':
1272	case 'U':
1273	c = tok_nextc(tok);
1274	if (c == 'r' \|\| c == 'R')
1275	c = tok_nextc(tok);
1276	if (c == '"' \|\| c == '\'')
1277	goto letter_quote;
1278	break;
1279	}
1280	while (isalnum(c) \|\| c == '_') {
1281	c = tok_nextc(tok);
1282	}
1283	tok_backup(tok, c);
1284	*p_start = tok->start;
1285	*p_end = tok->cur;
1286	return NAME;
1287	}
1288
1289	/* Newline */
1290	if (c == '\n') {
1291	tok->atbol = 1;
1292	if (blankline \|\| tok->level > 0)
1293	goto nextline;
1294	*p_start = tok->start;
1295	p_end = tok->cur - 1; / Leave '\n' out of the string */
1296	tok->cont_line = 0;
1297	return NEWLINE;
1298	}
1299
1300	/* Period or number starting with period? */
1301	if (c == '.') {
1302	c = tok_nextc(tok);
1303	if (isdigit(c)) {
1304	goto fraction;
1305	}
1306	else {
1307	tok_backup(tok, c);
1308	*p_start = tok->start;
1309	*p_end = tok->cur;
1310	return DOT;
1311	}
1312	}
1313
1314	/* Number */
1315	if (isdigit(c)) {
1316	if (c == '0') {
1317	/* Hex or octal -- maybe. */
1318	c = tok_nextc(tok);
1319	if (c == '.')
1320	goto fraction;
1321	#ifndef WITHOUT_COMPLEX
1322	if (c == 'j' \|\| c == 'J')
1323	goto imaginary;
1324	#endif
1325	if (c == 'x' \|\| c == 'X') {
1326	/* Hex */
1327	do {
1328	c = tok_nextc(tok);
1329	} while (isxdigit(c));
1330	}
1331	else {
1332	int found_decimal = 0;
1333	/* Octal; c is first char of it */
1334	/* There's no 'isoctdigit' macro, sigh */
1335	while ('0' <= c && c < '8') {
1336	c = tok_nextc(tok);
1337	}
1338	if (isdigit(c)) {
1339	found_decimal = 1;
1340	do {
1341	c = tok_nextc(tok);
1342	} while (isdigit(c));
1343	}
1344	if (c == '.')
1345	goto fraction;
1346	else if (c == 'e' \|\| c == 'E')
1347	goto exponent;
1348	#ifndef WITHOUT_COMPLEX
1349	else if (c == 'j' \|\| c == 'J')
1350	goto imaginary;
1351	#endif
1352	else if (found_decimal) {
1353	tok->done = E_TOKEN;
1354	tok_backup(tok, c);
1355	return ERRORTOKEN;
1356	}
1357	}
1358	if (c == 'l' \|\| c == 'L')
1359	c = tok_nextc(tok);
1360	}
1361	else {
1362	/* Decimal */
1363	do {
1364	c = tok_nextc(tok);
1365	} while (isdigit(c));
1366	if (c == 'l' \|\| c == 'L')
1367	c = tok_nextc(tok);
1368	else {
1369	/* Accept floating point numbers. */
1370	if (c == '.') {
1371	fraction:
1372	/* Fraction */
1373	do {
1374	c = tok_nextc(tok);
1375	} while (isdigit(c));
1376	}
1377	if (c == 'e' \|\| c == 'E') {
1378	exponent:
1379	/* Exponent part */
1380	c = tok_nextc(tok);
1381	if (c == '+' \|\| c == '-')
1382	c = tok_nextc(tok);
1383	if (!isdigit(c)) {
1384	tok->done = E_TOKEN;
1385	tok_backup(tok, c);
1386	return ERRORTOKEN;
1387	}
1388	do {
1389	c = tok_nextc(tok);
1390	} while (isdigit(c));
1391	}
1392	#ifndef WITHOUT_COMPLEX
1393	if (c == 'j' \|\| c == 'J')
1394	/* Imaginary part */
1395	imaginary:
1396	c = tok_nextc(tok);
1397	#endif
1398	}
1399	}
1400	tok_backup(tok, c);
1401	*p_start = tok->start;
1402	*p_end = tok->cur;
1403	return NUMBER;
1404	}
1405
1406	letter_quote:
1407	/* String */
1408	if (c == '\'' \|\| c == '"') {
1409	Py_ssize_t quote2 = tok->cur - tok->start + 1;
1410	int quote = c;
1411	int triple = 0;
1412	int tripcount = 0;
1413	for (;;) {
1414	c = tok_nextc(tok);
1415	if (c == '\n') {
1416	if (!triple) {
1417	tok->done = E_EOLS;
1418	tok_backup(tok, c);
1419	return ERRORTOKEN;
1420	}
1421	tripcount = 0;
1422	tok->cont_line = 1; /* multiline string. */
1423	}
1424	else if (c == EOF) {
1425	if (triple)
1426	tok->done = E_EOFS;
1427	else
1428	tok->done = E_EOLS;
1429	tok->cur = tok->inp;
1430	return ERRORTOKEN;
1431	}
1432	else if (c == quote) {
1433	tripcount++;
1434	if (tok->cur - tok->start == quote2) {
1435	c = tok_nextc(tok);
1436	if (c == quote) {
1437	triple = 1;
1438	tripcount = 0;
1439	continue;
1440	}
1441	tok_backup(tok, c);
1442	}
1443	if (!triple \|\| tripcount == 3)
1444	break;
1445	}
1446	else if (c == '\\') {
1447	tripcount = 0;
1448	c = tok_nextc(tok);
1449	if (c == EOF) {
1450	tok->done = E_EOLS;
1451	tok->cur = tok->inp;
1452	return ERRORTOKEN;
1453	}
1454	}
1455	else
1456	tripcount = 0;
1457	}
1458	*p_start = tok->start;
1459	*p_end = tok->cur;
1460	return STRING;
1461	}
1462
1463	/* Line continuation */
1464	if (c == '\\') {
1465	c = tok_nextc(tok);
1466	if (c != '\n') {
1467	tok->done = E_LINECONT;
1468	tok->cur = tok->inp;
1469	return ERRORTOKEN;
1470	}
1471	tok->cont_line = 1;
1472	goto again; /* Read next line */
1473	}
1474
1475	/* Check for two-character token */
1476	{
1477	int c2 = tok_nextc(tok);
1478	int token = PyToken_TwoChars(c, c2);
1479	if (token != OP) {
1480	int c3 = tok_nextc(tok);
1481	int token3 = PyToken_ThreeChars(c, c2, c3);
1482	if (token3 != OP) {
1483	token = token3;
1484	} else {
1485	tok_backup(tok, c3);
1486	}
1487	*p_start = tok->start;
1488	*p_end = tok->cur;
1489	return token;
1490	}
1491	tok_backup(tok, c2);
1492	}
1493
1494	/* Keep track of parentheses nesting level */
1495	switch (c) {
1496	case '(':
1497	case '[':
1498	case '{':
1499	tok->level++;
1500	break;
1501	case ')':
1502	case ']':
1503	case '}':
1504	tok->level--;
1505	break;
1506	}
1507
1508	/* Punctuation character */
1509	*p_start = tok->start;
1510	*p_end = tok->cur;
1511	return PyToken_OneChar(c);
1512	}
1513
1514	int
1515	PyTokenizer_Get(struct tok_state tok, char p_start, char *p_end)
1516	{
1517	int result = tok_get(tok, p_start, p_end);
1518	if (tok->decoding_erred) {
1519	result = ERRORTOKEN;
1520	tok->done = E_DECODE;
1521	}
1522	return result;
1523	}
1524
1525	#ifdef Py_DEBUG
1526
1527	void
1528	tok_dump(int type, char start, char end)
1529	{
1530	printf("%s", _PyParser_TokenNames[type]);
1531	if (type == NAME \|\| type == NUMBER \|\| type == STRING \|\| type == OP)
1532	printf("(%.*s)", (int)(end - start), start);
1533	}
1534
1535	#endif

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/essentials/dev-lang/python/Parser/tokenizer.c@ 3506

Download in other formats: