Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

tokenize.py@ 3404

Visit:

Last change on this file since 3404 was 3225, checked in by bird, 19 years ago
Python 2.5
File size: 13.2 KB

Rev	Line
"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

# NOTE(review): the address below looks like a placeholder substituted by the
# web export this file was taken from -- restore it from the upstream file.
__author__ = 'Ka-Ping Yee <[email protected]>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from token import *

import token
# Re-export every public name of the token module plus the names added here.
# A generator expression is used instead of a list comprehension so that the
# loop variable never lands in the module namespace (Python 2 leaks it,
# Python 3 does not), which removes the need for a version-dependent ``del``.
__all__ = list(name for name in dir(token) if name[0] != '_') + [
    "COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
del token

# Two extra token types that this module emits: COMMENT for comments and NL
# for newlines that do not end a logical line.  They are numbered right
# after the last token type known to the token module.
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2
	43
	44	def group(*choices): return '(' + '\|'.join(choices) + ')'
	45	def any(choices): return group(choices) + '*'
	46	def maybe(choices): return group(choices) + '?'
	47
	48	Whitespace = r'[ \f\t]*'
	49	Comment = r'#[^\r\n]*'
	50	Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
	51	Name = r'[a-zA-Z_]\w*'
	52
	53	Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
	54	Octnumber = r'0[0-7]*[lL]?'
	55	Decnumber = r'[1-9]\d*[lL]?'
	56	Intnumber = group(Hexnumber, Octnumber, Decnumber)
	57	Exponent = r'[eE][-+]?\d+'
	58	Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
	59	Expfloat = r'\d+' + Exponent
	60	Floatnumber = group(Pointfloat, Expfloat)
	61	Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
	62	Number = group(Imagnumber, Floatnumber, Intnumber)
	63
	64	# Tail end of ' string.
	65	Single = r"[^'\\](?:\\.[^'\\])*'"
	66	# Tail end of " string.
	67	Double = r'[^"\\](?:\\.[^"\\])*"'
	68	# Tail end of ''' string.
	69	Single3 = r"[^'\\](?:(?:\\.\|'(?!''))[^'\\])*'''"
	70	# Tail end of """ string.
	71	Double3 = r'[^"\\](?:(?:\\.\|"(?!""))[^"\\])*"""'
	72	Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
	73	# Single-line ' or " string.
	74	String = group(r"[uU]?[rR]?'[^\n'\\](?:\\.[^\n'\\])*'",
	75	r'[uU]?[rR]?"[^\n"\\](?:\\.[^\n"\\])*"')
	76
	77	# Because of leftmost-then-longest match semantics, be sure to put the
	78	# longest operators first (e.g., if = came before ==, == would get
	79	# recognized as two instances of =).
	80	Operator = group(r"\\=?", r">>=?", r"<<=?", r"<>", r"!=",
	81	r"//=?",
	82	r"[+\-*/%&\|^=<>]=?",
	83	r"~")
	84
	85	Bracket = '[][(){}]'
	86	Special = group(r'\r?\n', r'[:;.,`@]')
	87	Funny = group(Operator, Bracket, Special)
	88
	89	PlainToken = group(Number, Funny, String, Name)
	90	Token = Ignore + PlainToken
	91
	92	# First (or only) line of ' or " string.
	93	ContStr = group(r"[uU]?[rR]?'[^\n'\\](?:\\.[^\n'\\])*" +
	94	group("'", r'\\\r?\n'),
	95	r'[uU]?[rR]?"[^\n"\\](?:\\.[^\n"\\])*' +
	96	group('"', r'\\\r?\n'))
	97	PseudoExtras = group(r'\\\r?\n', Comment, Triple)
	98	PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
	99
	100	tokenprog, pseudoprog, single3prog, double3prog = map(
	101	re.compile, (Token, PseudoToken, Single3, Double3))
	102	endprogs = {"'": re.compile(Single), '"': re.compile(Double),
	103	"'''": single3prog, '"""': double3prog,
	104	"r'''": single3prog, 'r"""': double3prog,
	105	"u'''": single3prog, 'u"""': double3prog,
	106	"ur'''": single3prog, 'ur"""': double3prog,
	107	"R'''": single3prog, 'R"""': double3prog,
	108	"U'''": single3prog, 'U"""': double3prog,
	109	"uR'''": single3prog, 'uR"""': double3prog,
	110	"Ur'''": single3prog, 'Ur"""': double3prog,
	111	"UR'''": single3prog, 'UR"""': double3prog,
	112	'r': None, 'R': None, 'u': None, 'U': None}
	113
	114	triple_quoted = {}
	115	for t in ("'''", '"""',
	116	"r'''", 'r"""', "R'''", 'R"""',
	117	"u'''", 'u"""', "U'''", 'U"""',
	118	"ur'''", 'ur"""', "Ur'''", 'Ur"""',
	119	"uR'''", 'uR"""', "UR'''", 'UR"""'):
	120	triple_quoted[t] = t
	121	single_quoted = {}
	122	for t in ("'", '"',
	123	"r'", 'r"', "R'", 'R"',
	124	"u'", 'u"', "U'", 'U"',
	125	"ur'", 'ur"', "Ur'", 'Ur"',
	126	"uR'", 'uR"', "UR'", 'UR"' ):
	127	single_quoted[t] = t
	128
	129	tabsize = 8
	130
	131	class TokenError(Exception): pass
	132
	133	class StopTokenizing(Exception): pass
	134
	135	def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
	136	print "%d,%d-%d,%d:\t%s\t%s" % \
	137	(srow, scol, erow, ecol, tok_name[type], repr(token))
	138
	139	def tokenize(readline, tokeneater=printtoken):
	140	"""
	141	The tokenize() function accepts two parameters: one representing the
	142	input stream, and one providing an output mechanism for tokenize().
	143
	144	The first parameter, readline, must be a callable object which provides
	145	the same interface as the readline() method of built-in file objects.
	146	Each call to the function should return one line of input as a string.
	147
	148	The second parameter, tokeneater, must also be a callable object. It is
	149	called once for each token, with five arguments, corresponding to the
	150	tuples generated by generate_tokens().
	151	"""
	152	try:
	153	tokenize_loop(readline, tokeneater)
	154	except StopTokenizing:
	155	pass
	156
	157	# backwards compatible interface
	158	def tokenize_loop(readline, tokeneater):
	159	for token_info in generate_tokens(readline):
	160	tokeneater(*token_info)
	161
	162
	163	def untokenize(iterable):
	164	"""Transform tokens back into Python source code.
	165
	166	Each element returned by the iterable must be a token sequence
	167	with at least two elements, a token number and token value.
	168
	169	Round-trip invariant:
	170	# Output text will tokenize the back to the input
	171	t1 = [tok[:2] for tok in generate_tokens(f.readline)]
	172	newcode = untokenize(t1)
	173	readline = iter(newcode.splitlines(1)).next
	174	t2 = [tok[:2] for tokin generate_tokens(readline)]
	175	assert t1 == t2
	176	"""
	177
	178	startline = False
	179	indents = []
	180	toks = []
	181	toks_append = toks.append
	182	for tok in iterable:
	183	toknum, tokval = tok[:2]
	184
	185	if toknum in (NAME, NUMBER):
	186	tokval += ' '
	187
	188	if toknum == INDENT:
	189	indents.append(tokval)
	190	continue
	191	elif toknum == DEDENT:
	192	indents.pop()
	193	continue
	194	elif toknum in (NEWLINE, COMMENT, NL):
	195	startline = True
	196	elif startline and indents:
	197	toks_append(indents[-1])
	198	startline = False
	199	toks_append(tokval)
	200	return ''.join(toks)
	201
	202
	203	def generate_tokens(readline):
	204	"""
	205	The generate_tokens() generator requires one argment, readline, which
	206	must be a callable object which provides the same interface as the
	207	readline() method of built-in file objects. Each call to the function
	208	should return one line of input as a string. Alternately, readline
	209	can be a callable function terminating with StopIteration:
	210	readline = open(myfile).next # Example of alternate readline
	211
	212	The generator produces 5-tuples with these members: the token type; the
	213	token string; a 2-tuple (srow, scol) of ints specifying the row and
	214	column where the token begins in the source; a 2-tuple (erow, ecol) of
	215	ints specifying the row and column where the token ends in the source;
	216	and the line on which the token was found. The line passed is the
	217	logical line; continuation lines are included.
	218	"""
	219	lnum = parenlev = continued = 0
	220	namechars, numchars = string.ascii_letters + '_', '0123456789'
	221	contstr, needcont = '', 0
	222	contline = None
	223	indents = [0]
	224
	225	while 1: # loop over lines in stream
	226	try:
	227	line = readline()
	228	except StopIteration:
	229	line = ''
	230	lnum = lnum + 1
	231	pos, max = 0, len(line)
	232
	233	if contstr: # continued string
	234	if not line:
	235	raise TokenError, ("EOF in multi-line string", strstart)
	236	endmatch = endprog.match(line)
	237	if endmatch:
	238	pos = end = endmatch.end(0)
	239	yield (STRING, contstr + line[:end],
	240	strstart, (lnum, end), contline + line)
	241	contstr, needcont = '', 0
	242	contline = None
	243	elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
	244	yield (ERRORTOKEN, contstr + line,
	245	strstart, (lnum, len(line)), contline)
	246	contstr = ''
	247	contline = None
	248	continue
	249	else:
	250	contstr = contstr + line
	251	contline = contline + line
	252	continue
	253
	254	elif parenlev == 0 and not continued: # new statement
	255	if not line: break
	256	column = 0
	257	while pos < max: # measure leading whitespace
	258	if line[pos] == ' ': column = column + 1
	259	elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
	260	elif line[pos] == '\f': column = 0
	261	else: break
	262	pos = pos + 1
	263	if pos == max: break
	264
	265	if line[pos] in '#\r\n': # skip comments or blank lines
	266	yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
	267	(lnum, pos), (lnum, len(line)), line)
	268	continue
	269
	270	if column > indents[-1]: # count indents or dedents
	271	indents.append(column)
	272	yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
	273	while column < indents[-1]:
	274	if column not in indents:
	275	raise IndentationError(
	276	"unindent does not match any outer indentation level",
	277	("<tokenize>", lnum, pos, line))
	278	indents = indents[:-1]
	279	yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
	280
	281	else: # continued statement
	282	if not line:
	283	raise TokenError, ("EOF in multi-line statement", (lnum, 0))
	284	continued = 0
	285
	286	while pos < max:
	287	pseudomatch = pseudoprog.match(line, pos)
	288	if pseudomatch: # scan for tokens
	289	start, end = pseudomatch.span(1)
	290	spos, epos, pos = (lnum, start), (lnum, end), end
	291	token, initial = line[start:end], line[start]
	292
	293	if initial in numchars or \
	294	(initial == '.' and token != '.'): # ordinary number
	295	yield (NUMBER, token, spos, epos, line)
	296	elif initial in '\r\n':
	297	yield (parenlev > 0 and NL or NEWLINE,
	298	token, spos, epos, line)
	299	elif initial == '#':
	300	yield (COMMENT, token, spos, epos, line)
	301	elif token in triple_quoted:
	302	endprog = endprogs[token]
	303	endmatch = endprog.match(line, pos)
	304	if endmatch: # all on one line
	305	pos = endmatch.end(0)
	306	token = line[start:pos]
	307	yield (STRING, token, spos, (lnum, pos), line)
	308	else:
	309	strstart = (lnum, start) # multiple lines
	310	contstr = line[start:]
	311	contline = line
	312	break
	313	elif initial in single_quoted or \
	314	token[:2] in single_quoted or \
	315	token[:3] in single_quoted:
	316	if token[-1] == '\n': # continued string
	317	strstart = (lnum, start)
	318	endprog = (endprogs[initial] or endprogs[token[1]] or
	319	endprogs[token[2]])
	320	contstr, needcont = line[start:], 1
	321	contline = line
	322	break
	323	else: # ordinary string
	324	yield (STRING, token, spos, epos, line)
	325	elif initial in namechars: # ordinary name
	326	yield (NAME, token, spos, epos, line)
	327	elif initial == '\\': # continued stmt
	328	continued = 1
	329	else:
	330	if initial in '([{': parenlev = parenlev + 1
	331	elif initial in ')]}': parenlev = parenlev - 1
	332	yield (OP, token, spos, epos, line)
	333	else:
	334	yield (ERRORTOKEN, line[pos],
	335	(lnum, pos), (lnum, pos+1), line)
	336	pos = pos + 1
	337
	338	for indent in indents[1:]: # pop remaining indent levels
	339	yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
	340	yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
	341
	342	if __name__ == '__main__': # testing
	343	import sys
	344	if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
	345	else: tokenize(sys.stdin.readline)

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: vendor/python/2.5/Lib/tokenize.py@ 3404

Download in other formats: