clang
20.0.0git
include
clang
Tooling
Syntax
Tokens.h
Go to the documentation of this file.
1
//===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
// Record tokens that a preprocessor emits and define operations to map between
9
// the tokens written in a file and tokens produced by the preprocessor.
10
//
11
// When running the compiler, there are two token streams we are interested in:
12
// - "spelled" tokens directly correspond to a substring written in some
13
// source file.
14
// - "expanded" tokens represent the result of preprocessing, the parser consumes
15
// this token stream to produce the AST.
16
//
17
// Expanded tokens correspond directly to locations found in the AST, allowing
18
// to find subranges of the token stream covered by various AST nodes. Spelled
19
// tokens correspond directly to the source code written by the user.
20
//
21
// To allow composing these two use-cases, we also define operations that map
22
// between expanded and spelled tokens that produced them (macro calls,
23
// directives, etc).
24
//
25
//===----------------------------------------------------------------------===//
26
27
#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
28
#define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
29
30
#include "
clang/Basic/LangOptions.h
"
31
#include "
clang/Basic/SourceLocation.h
"
32
#include "
clang/Basic/SourceManager.h
"
33
#include "
clang/Basic/TokenKinds.h
"
34
#include "
clang/Lex/Token.h
"
35
#include "llvm/ADT/ArrayRef.h"
36
#include "llvm/ADT/DenseMap.h"
37
#include "llvm/ADT/StringRef.h"
38
#include "llvm/Support/Compiler.h"
39
#include "llvm/Support/raw_ostream.h"
40
#include <cstdint>
41
#include <tuple>
42
43
namespace
clang
{
44
class
Preprocessor;
45
46
namespace
syntax {
47
48
/// A half-open character range inside a particular file, the start offset is
49
/// included and the end offset is excluded from the range.
50
struct
FileRange
{
51
/// EXPECTS: File.isValid() && Begin <= End.
52
FileRange
(
FileID
File,
unsigned
BeginOffset,
unsigned
EndOffset);
53
/// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID().
54
FileRange
(
const
SourceManager
&
SM
,
SourceLocation
BeginLoc,
unsigned
Length);
55
/// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and files
56
/// are the same.
57
FileRange
(
const
SourceManager
&
SM
,
SourceLocation
BeginLoc,
58
SourceLocation
EndLoc);
59
60
FileID
file
()
const
{
return
File
; }
61
/// Start is a start offset (inclusive) in the corresponding file.
62
unsigned
beginOffset
()
const
{
return
Begin; }
63
/// End offset (exclusive) in the corresponding file.
64
unsigned
endOffset
()
const
{
return
End; }
65
66
unsigned
length
()
const
{
return
End - Begin; }
67
68
/// Check if \p Offset is inside the range.
69
bool
contains
(
unsigned
Offset)
const
{
70
return
Begin <= Offset && Offset < End;
71
}
72
/// Check \p Offset is inside the range or equal to its endpoint.
73
bool
touches
(
unsigned
Offset)
const
{
74
return
Begin <= Offset && Offset <= End;
75
}
76
77
/// Gets the substring that this FileRange refers to.
78
llvm::StringRef
text
(
const
SourceManager
&
SM
)
const
;
79
80
/// Convert to the clang range. The returned range is always a char range,
81
/// never a token range.
82
CharSourceRange
toCharRange
(
const
SourceManager
&
SM
)
const
;
83
84
friend
bool
operator==
(
const
FileRange
&L,
const
FileRange
&R) {
85
return
std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End);
86
}
87
friend
bool
operator!=
(
const
FileRange
&L,
const
FileRange
&R) {
88
return
!(L == R);
89
}
90
91
private
:
92
FileID
File
;
93
unsigned
Begin;
94
unsigned
End;
95
};
96
97
/// For debugging purposes.
98
llvm::raw_ostream &
operator<<
(llvm::raw_ostream &OS,
const
FileRange &R);
99
100
/// A token coming directly from a file or from a macro invocation. Has just
101
/// enough information to locate the token in the source code.
102
/// Can represent both expanded and spelled tokens.
103
class
Token
{
104
public
:
105
Token
(
SourceLocation
Location,
unsigned
Length,
tok::TokenKind
Kind);
106
/// EXPECTS: clang::Token is not an annotation token.
107
explicit
Token
(
const
clang::Token
&
T
);
108
109
tok::TokenKind
kind
()
const
{
return
Kind; }
110
/// Location of the first character of a token.
111
SourceLocation
location
()
const
{
return
Location; }
112
/// Location right after the last character of a token.
113
SourceLocation
endLocation
()
const
{
114
return
Location.
getLocWithOffset
(Length);
115
}
116
unsigned
length
()
const
{
return
Length; }
117
118
/// Get the substring covered by the token. Note that will include all
119
/// digraphs, newline continuations, etc. E.g. tokens for 'int' and
120
/// in\
121
/// t
122
/// both have the same kind tok::kw_int, but results of text() are different.
123
llvm::StringRef
text
(
const
SourceManager
&
SM
)
const
;
124
125
/// Gets a range of this token.
126
/// EXPECTS: token comes from a file, not from a macro expansion.
127
FileRange
range
(
const
SourceManager
&
SM
)
const
;
128
129
/// Given two tokens inside the same file, returns a file range that starts at
130
/// \p First and ends at \p Last.
131
/// EXPECTS: First and Last are file tokens from the same file, Last starts
132
/// after First.
133
static
FileRange
range
(
const
SourceManager
&
SM
,
const
syntax::Token
&
First
,
134
const
syntax::Token
&
Last
);
135
136
std::string
dumpForTests
(
const
SourceManager
&
SM
)
const
;
137
/// For debugging purposes.
138
std::string
str
()
const
;
139
140
private
:
141
SourceLocation
Location;
142
unsigned
Length;
143
tok::TokenKind
Kind;
144
};
145
/// For debugging purposes. Equivalent to a call to Token::str().
146
llvm::raw_ostream &
operator<<
(llvm::raw_ostream &OS,
const
Token
&
T
);
147
148
/// A list of tokens obtained by preprocessing a text buffer and operations to
149
/// map between the expanded and spelled tokens, i.e. TokenBuffer has
150
/// information about two token streams:
151
/// 1. Expanded tokens: tokens produced by the preprocessor after all macro
152
/// replacements,
153
/// 2. Spelled tokens: corresponding directly to the source code of a file
154
/// before any macro replacements occurred.
155
/// Here's an example to illustrate a difference between those two:
156
/// #define FOO 10
157
/// int a = FOO;
158
///
159
/// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}.
160
/// Expanded tokens are {'int','a','=','10',';','eof'}.
161
///
162
/// Note that the expanded token stream has a tok::eof token at the end, the
163
/// spelled tokens never store a 'eof' token.
164
///
165
/// The full list of expanded tokens can be obtained with expandedTokens(). Spelled
166
/// tokens for each of the files can be obtained via spelledTokens(FileID).
167
///
168
/// To map between the expanded and spelled tokens use spelledForExpanded() and
/// expandedForSpelled().
169
///
170
/// To build a token buffer use the TokenCollector class. You can also compute
171
/// the spelled tokens of a file using the tokenize() helper.
172
///
173
/// FIXME: allow mappings into macro arguments.
174
class
TokenBuffer
{
175
public
:
176
TokenBuffer
(
const
SourceManager
&SourceMgr) : SourceMgr(&SourceMgr) {}
177
178
TokenBuffer
(
TokenBuffer
&&) =
default
;
179
TokenBuffer
(
const
TokenBuffer
&) =
delete
;
180
TokenBuffer
&
operator=
(
TokenBuffer
&&) =
default
;
181
TokenBuffer
&
operator=
(
const
TokenBuffer
&) =
delete
;
182
183
/// All tokens produced by the preprocessor after all macro replacements,
184
/// directives, etc. Source locations found in the clang AST will always
185
/// point to one of these tokens.
186
/// Tokens are in TU order (per SourceManager::isBeforeInTranslationUnit()).
187
/// FIXME: figure out how to handle token splitting, e.g. '>>' can be split
188
/// into two '>' tokens by the parser. However, TokenBuffer currently
189
/// keeps it as a single '>>' token.
190
llvm::ArrayRef<syntax::Token>
expandedTokens
()
const
{
191
return
ExpandedTokens;
192
}
193
194
/// Builds a cache to make future calls to expandedTokens(SourceRange) faster.
195
/// Creates an index only once. Further calls to it will be no-op.
196
void
indexExpandedTokens
();
197
198
/// Returns the subrange of expandedTokens() corresponding to the closed
199
/// token range R.
200
/// Consider calling indexExpandedTokens() before for faster lookups.
201
llvm::ArrayRef<syntax::Token>
expandedTokens
(
SourceRange
R)
const
;
202
203
/// Returns the subrange of spelled tokens corresponding to AST node spanning
204
/// \p Expanded. This is the text that should be replaced if a refactoring
205
/// were to rewrite the node. If \p Expanded is empty, the returned value is
206
/// std::nullopt.
207
///
208
/// Will fail if the expanded tokens do not correspond to a sequence of
209
/// spelled tokens. E.g. for the following example:
210
///
211
/// #define FIRST f1 f2 f3
212
/// #define SECOND s1 s2 s3
213
/// #define ID2(X, Y) X Y
214
///
215
/// a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c
216
/// d ID2(e f g, h) i // expanded tokens are: d e f g h i
217
///
218
/// the results would be:
219
/// expanded => spelled
220
/// ------------------------
221
/// a => a
222
/// s1 s2 s3 => SECOND
223
/// a f1 f2 f3 => a FIRST
224
/// a f1 => can't map
225
/// s1 s2 => can't map
226
/// e f => e f
227
/// g h => can't map
228
///
229
/// EXPECTS: \p Expanded is a subrange of expandedTokens().
230
/// Complexity is logarithmic.
231
std::optional<llvm::ArrayRef<syntax::Token>>
232
spelledForExpanded
(
llvm::ArrayRef<syntax::Token>
Expanded)
const
;
233
234
/// Find the subranges of expanded tokens, corresponding to \p Spelled.
235
///
236
/// Some spelled tokens may not be present in the expanded token stream, so
237
/// this function can return an empty vector, e.g. for tokens of macro
238
/// directives or disabled preprocessor branches.
239
///
240
/// Some spelled tokens can be duplicated in the expanded token stream
241
/// multiple times and this function will return multiple results in those
242
/// cases. This happens when \p Spelled is inside a macro argument.
243
///
244
/// FIXME: return correct results on macro arguments. For now, we return an
245
/// empty list.
246
///
247
/// (!) will return empty vector on tokens from #define body:
248
/// E.g. for the following example:
249
///
250
/// #define FIRST(A) f1 A = A f2
251
/// #define SECOND s
252
///
253
/// a FIRST(arg) b SECOND c // expanded tokens are: a f1 arg = arg f2 b s c
254
/// The results would be
255
/// spelled => expanded
256
/// ------------------------
257
/// #define FIRST => {}
258
/// a FIRST(arg) => {a f1 arg = arg f2}
259
/// arg => {arg, arg} // arg #1 is before `=` and arg #2 is
260
/// // after `=` in the expanded tokens.
261
llvm::SmallVector<llvm::ArrayRef<syntax::Token>
, 1>
262
expandedForSpelled
(
llvm::ArrayRef<syntax::Token>
Spelled)
const
;
263
264
/// An expansion produced by the preprocessor, includes macro expansions and
265
/// preprocessor directives. Preprocessor always maps a non-empty range of
266
/// spelled tokens to a (possibly empty) range of expanded tokens. Here are a
267
/// few examples of expansions:
268
/// #pragma once // Expands to an empty range.
269
/// #define FOO 1 2 3 // Expands to an empty range.
270
/// FOO // Expands to "1 2 3".
271
/// FIXME(ibiryukov): implement this, currently #include expansions are empty.
272
/// #include <vector> // Expands to tokens produced by the include.
273
struct
Expansion
{
274
llvm::ArrayRef<syntax::Token>
Spelled
;
275
llvm::ArrayRef<syntax::Token>