diff options
Diffstat (limited to 'prism/parser.h')
-rw-r--r-- | prism/parser.h | 417 |
1 files changed, 417 insertions, 0 deletions
diff --git a/prism/parser.h b/prism/parser.h new file mode 100644 index 0000000000..89b0f2744b --- /dev/null +++ b/prism/parser.h @@ -0,0 +1,417 @@ +#ifndef YARP_PARSER_H +#define YARP_PARSER_H + +#include "yarp/ast.h" +#include "yarp/defines.h" +#include "yarp/enc/yp_encoding.h" +#include "yarp/util/yp_constant_pool.h" +#include "yarp/util/yp_list.h" +#include "yarp/util/yp_newline_list.h" +#include "yarp/util/yp_state_stack.h" + +#include <stdbool.h> + +// This enum provides various bits that represent different kinds of states that +// the lexer can track. This is used to determine which kind of token to return +// based on the context of the parser. +typedef enum { + YP_LEX_STATE_BIT_BEG, + YP_LEX_STATE_BIT_END, + YP_LEX_STATE_BIT_ENDARG, + YP_LEX_STATE_BIT_ENDFN, + YP_LEX_STATE_BIT_ARG, + YP_LEX_STATE_BIT_CMDARG, + YP_LEX_STATE_BIT_MID, + YP_LEX_STATE_BIT_FNAME, + YP_LEX_STATE_BIT_DOT, + YP_LEX_STATE_BIT_CLASS, + YP_LEX_STATE_BIT_LABEL, + YP_LEX_STATE_BIT_LABELED, + YP_LEX_STATE_BIT_FITEM +} yp_lex_state_bit_t; + +// This enum combines the various bits from the above enum into individual +// values that represent the various states of the lexer. +typedef enum { + YP_LEX_STATE_NONE = 0, + YP_LEX_STATE_BEG = (1 << YP_LEX_STATE_BIT_BEG), + YP_LEX_STATE_END = (1 << YP_LEX_STATE_BIT_END), + YP_LEX_STATE_ENDARG = (1 << YP_LEX_STATE_BIT_ENDARG), + YP_LEX_STATE_ENDFN = (1 << YP_LEX_STATE_BIT_ENDFN), + YP_LEX_STATE_ARG = (1 << YP_LEX_STATE_BIT_ARG), + YP_LEX_STATE_CMDARG = (1 << YP_LEX_STATE_BIT_CMDARG), + YP_LEX_STATE_MID = (1 << YP_LEX_STATE_BIT_MID), + YP_LEX_STATE_FNAME = (1 << YP_LEX_STATE_BIT_FNAME), + YP_LEX_STATE_DOT = (1 << YP_LEX_STATE_BIT_DOT), + YP_LEX_STATE_CLASS = (1 << YP_LEX_STATE_BIT_CLASS), + YP_LEX_STATE_LABEL = (1 << YP_LEX_STATE_BIT_LABEL), + YP_LEX_STATE_LABELED = (1 << YP_LEX_STATE_BIT_LABELED), + YP_LEX_STATE_FITEM = (1 << YP_LEX_STATE_BIT_FITEM), + YP_LEX_STATE_BEG_ANY = YP_LEX_STATE_BEG | YP_LEX_STATE_MID | YP_LEX_STATE_CLASS, + YP_LEX_STATE_ARG_ANY = YP_LEX_STATE_ARG | YP_LEX_STATE_CMDARG, + YP_LEX_STATE_END_ANY = YP_LEX_STATE_END | YP_LEX_STATE_ENDARG | YP_LEX_STATE_ENDFN +} yp_lex_state_t; + +typedef enum { + YP_HEREDOC_QUOTE_NONE, + YP_HEREDOC_QUOTE_SINGLE = '\'', + YP_HEREDOC_QUOTE_DOUBLE = '"', + YP_HEREDOC_QUOTE_BACKTICK = '`', +} yp_heredoc_quote_t; + +typedef enum { + YP_HEREDOC_INDENT_NONE, + YP_HEREDOC_INDENT_DASH, + YP_HEREDOC_INDENT_TILDE, +} yp_heredoc_indent_t; + +// When lexing Ruby source, the lexer has a small amount of state to tell which +// kind of token it is currently lexing. For example, when we find the start of +// a string, the first token that we return is a TOKEN_STRING_BEGIN token. After +// that the lexer is now in the YP_LEX_STRING mode, and will return tokens that +// are found as part of a string. +typedef struct yp_lex_mode { + enum { + // This state is used when any given token is being lexed. + YP_LEX_DEFAULT, + + // This state is used when we're lexing as normal but inside an embedded + // expression of a string. + YP_LEX_EMBEXPR, + + // This state is used when we're lexing a variable that is embedded + // directly inside of a string with the # shorthand. + YP_LEX_EMBVAR, + + // This state is used when you are inside the content of a heredoc. + YP_LEX_HEREDOC, + + // This state is used when we are lexing a list of tokens, as in a %w + // word list literal or a %i symbol list literal. + YP_LEX_LIST, + + // This state is used when a regular expression has been begun and we + // are looking for the terminator. + YP_LEX_REGEXP, + + // This state is used when we are lexing a string or a string-like + // token, as in string content with either quote or an xstring. + YP_LEX_STRING + } mode; + + union { + struct { + // This keeps track of the nesting level of the list. + size_t nesting; + + // Whether or not interpolation is allowed in this list. + bool interpolation; + + // When lexing a list, it takes into account balancing the + // terminator if the terminator is one of (), [], {}, or <>. + uint8_t incrementor; + + // This is the terminator of the list literal. + uint8_t terminator; + + // This is the character set that should be used to delimit the + // tokens within the list. + uint8_t breakpoints[11]; + } list; + + struct { + // This keeps track of the nesting level of the regular expression. + size_t nesting; + + // When lexing a regular expression, it takes into account balancing + // the terminator if the terminator is one of (), [], {}, or <>. + uint8_t incrementor; + + // This is the terminator of the regular expression. + uint8_t terminator; + + // This is the character set that should be used to delimit the + // tokens within the regular expression. + uint8_t breakpoints[6]; + } regexp; + + struct { + // This keeps track of the nesting level of the string. + size_t nesting; + + // Whether or not interpolation is allowed in this string. + bool interpolation; + + // Whether or not at the end of the string we should allow a :, + // which would indicate this was a dynamic symbol instead of a + // string. + bool label_allowed; + + // When lexing a string, it takes into account balancing the + // terminator if the terminator is one of (), [], {}, or <>. + uint8_t incrementor; + + // This is the terminator of the string. It is typically either a + // single or double quote. + uint8_t terminator; + + // This is the character set that should be used to delimit the + // tokens within the string. + uint8_t breakpoints[6]; + } string; + + struct { + // These pointers point to the beginning and end of the heredoc + // identifier. + const uint8_t *ident_start; + size_t ident_length; + + yp_heredoc_quote_t quote; + yp_heredoc_indent_t indent; + + // This is the pointer to the character where lexing should resume + // once the heredoc has been completely processed. + const uint8_t *next_start; + } heredoc; + } as; + + // The previous lex state so that it knows how to pop. + struct yp_lex_mode *prev; +} yp_lex_mode_t; + +// We pre-allocate a certain number of lex states in order to avoid having to +// call malloc too many times while parsing. You really shouldn't need more than +// this because you only really nest deeply when doing string interpolation. +#define YP_LEX_STACK_SIZE 4 + +// A forward declaration since our error handler struct accepts a parser for +// each of its function calls. +typedef struct yp_parser yp_parser_t; + +// While parsing, we keep track of a stack of contexts. This is helpful for +// error recovery so that we can pop back to a previous context when we hit a +// token that is understood by a parent context but not by the current context. +typedef enum { + YP_CONTEXT_BEGIN, // a begin statement + YP_CONTEXT_BLOCK_BRACES, // expressions in block arguments using braces + YP_CONTEXT_BLOCK_KEYWORDS, // expressions in block arguments using do..end + YP_CONTEXT_CASE_WHEN, // a case when statements + YP_CONTEXT_CASE_IN, // a case in statements + YP_CONTEXT_CLASS, // a class declaration + YP_CONTEXT_DEF, // a method definition + YP_CONTEXT_DEF_PARAMS, // a method definition's parameters + YP_CONTEXT_DEFAULT_PARAMS, // a method definition's default parameter + YP_CONTEXT_ELSE, // an else clause + YP_CONTEXT_ELSIF, // an elsif clause + YP_CONTEXT_EMBEXPR, // an interpolated expression + YP_CONTEXT_ENSURE, // an ensure statement + YP_CONTEXT_FOR, // a for loop + YP_CONTEXT_IF, // an if statement + YP_CONTEXT_LAMBDA_BRACES, // a lambda expression with braces + YP_CONTEXT_LAMBDA_DO_END, // a lambda expression with do..end + YP_CONTEXT_MAIN, // the top level context + YP_CONTEXT_MODULE, // a module declaration + YP_CONTEXT_PARENS, // a parenthesized expression + YP_CONTEXT_POSTEXE, // an END block + YP_CONTEXT_PREDICATE, // a predicate inside an if/elsif/unless statement + YP_CONTEXT_PREEXE, // a BEGIN block + YP_CONTEXT_RESCUE_ELSE, // a rescue else statement + YP_CONTEXT_RESCUE, // a rescue statement + YP_CONTEXT_SCLASS, // a singleton class definition + YP_CONTEXT_UNLESS, // an unless statement + YP_CONTEXT_UNTIL, // an until statement + YP_CONTEXT_WHILE, // a while statement +} yp_context_t; + +// This is a node in a linked list of contexts. +typedef struct yp_context_node { + yp_context_t context; + struct yp_context_node *prev; +} yp_context_node_t; + +// This is the type of a comment that we've found while parsing. +typedef enum { + YP_COMMENT_INLINE, + YP_COMMENT_EMBDOC, + YP_COMMENT___END__ +} yp_comment_type_t; + +// This is a node in the linked list of comments that we've found while parsing. +typedef struct yp_comment { + yp_list_node_t node; + const uint8_t *start; + const uint8_t *end; + yp_comment_type_t type; +} yp_comment_t; + +// When the encoding that is being used to parse the source is changed by YARP, +// we provide the ability here to call out to a user-defined function. +typedef void (*yp_encoding_changed_callback_t)(yp_parser_t *parser); + +// When an encoding is encountered that isn't understood by YARP, we provide +// the ability here to call out to a user-defined function to get an encoding +// struct. If the function returns something that isn't NULL, we set that to +// our encoding and use it to parse identifiers. +typedef yp_encoding_t *(*yp_encoding_decode_callback_t)(yp_parser_t *parser, const uint8_t *name, size_t width); + +// When you are lexing through a file, the lexer needs all of the information +// that the parser additionally provides (for example, the local table). So if +// you want to properly lex Ruby, you need to actually lex it in the context of +// the parser. In order to provide this functionality, we optionally allow a +// struct to be attached to the parser that calls back out to a user-provided +// callback when each token is lexed. +typedef struct { + // This opaque pointer is used to provide whatever information the user + // deemed necessary to the callback. In our case we use it to pass the array + // that the tokens get appended into. + void *data; + + // This is the callback that is called when a token is lexed. It is passed + // the opaque data pointer, the parser, and the token that was lexed. + void (*callback)(void *data, yp_parser_t *parser, yp_token_t *token); +} yp_lex_callback_t; + +// This struct represents a node in a linked list of scopes. Some scopes can see +// into their parent scopes, while others cannot. +typedef struct yp_scope { + // The IDs of the locals in the given scope. + yp_constant_id_list_t locals; + + // A pointer to the previous scope in the linked list. + struct yp_scope *previous; + + // A boolean indicating whether or not this scope can see into its parent. + // If closed is true, then the scope cannot see into its parent. + bool closed; + + // A boolean indicating whether or not this scope has explicit parameters. + // This is necessary to determine whether or not numbered parameters are + // allowed. + bool explicit_params; + + // A boolean indicating whether or not this scope has numbered parameters. + // This is necessary to determine if child blocks are allowed to use + // numbered parameters. + bool numbered_params; +} yp_scope_t; + +// This struct represents the overall parser. It contains a reference to the +// source file, as well as pointers that indicate where in the source it's +// currently parsing. It also contains the most recent and current token that +// it's considering. +struct yp_parser { + yp_lex_state_t lex_state; // the current state of the lexer + int enclosure_nesting; // tracks the current nesting of (), [], and {} + + // Used to temporarily track the nesting of enclosures to determine if a { + // is the beginning of a lambda following the parameters of a lambda. + int lambda_enclosure_nesting; + + // Used to track the nesting of braces to ensure we get the correct value + // when we are interpolating blocks with braces. + int brace_nesting; + + // the stack used to determine if a do keyword belongs to the predicate of a + // while, until, or for loop + yp_state_stack_t do_loop_stack; + + // the stack used to determine if a do keyword belongs to the beginning of a + // block + yp_state_stack_t accepts_block_stack; + + struct { + yp_lex_mode_t *current; // the current mode of the lexer + yp_lex_mode_t stack[YP_LEX_STACK_SIZE]; // the stack of lexer modes + size_t index; // the current index into the lexer mode stack + } lex_modes; + + const uint8_t *start; // the pointer to the start of the source + const uint8_t *end; // the pointer to the end of the source + yp_token_t previous; // the previous token we were considering + yp_token_t current; // the current token we're considering + + // This is a special field set on the parser when we need the parser to jump + // to a specific location when lexing the next token, as opposed to just + // using the end of the previous token. Normally this is NULL. + const uint8_t *next_start; + + // This field indicates the end of a heredoc whose identifier was found on + // the current line. If another heredoc is found on the same line, then this + // will be moved forward to the end of that heredoc. If no heredocs are + // found on a line then this is NULL. + const uint8_t *heredoc_end; + + yp_list_t comment_list; // the list of comments that have been found while parsing + yp_list_t warning_list; // the list of warnings that have been found while parsing + yp_list_t error_list; // the list of errors that have been found while parsing + yp_scope_t *current_scope; // the current local scope + + yp_context_node_t *current_context; // the current parsing context + + // The encoding functions for the current file is attached to the parser as + // it's parsing so that it can change with a magic comment. + yp_encoding_t encoding; + + // When the encoding that is being used to parse the source is changed by + // YARP, we provide the ability here to call out to a user-defined function. + yp_encoding_changed_callback_t encoding_changed_callback; + + // When an encoding is encountered that isn't understood by YARP, we provide + // the ability here to call out to a user-defined function to get an + // encoding struct. If the function returns something that isn't NULL, we + // set that to our encoding and use it to parse identifiers. + yp_encoding_decode_callback_t encoding_decode_callback; + + // This pointer indicates where a comment must start if it is to be + // considered an encoding comment. + const uint8_t *encoding_comment_start; + + // This is an optional callback that can be attached to the parser that will + // be called whenever a new token is lexed by the parser. + yp_lex_callback_t *lex_callback; + + // This is the path of the file being parsed + // We use the filepath when constructing SourceFileNodes + yp_string_t filepath_string; + + // This constant pool keeps all of the constants defined throughout the file + // so that we can reference them later. + yp_constant_pool_t constant_pool; + + // This is the list of newline offsets in the source file. + yp_newline_list_t newline_list; + + // We want to add a flag to integer nodes that indicates their base. We only + // want to parse these once, but we don't have space on the token itself to + // communicate this information. So we store it here and pass it through + // when we find tokens that we need it for. + yp_node_flags_t integer_base; + + // Whether or not we're at the beginning of a command + bool command_start; + + // Whether or not we're currently recovering from a syntax error + bool recovering; + + // Whether or not the encoding has been changed by a magic comment. We use + // this to provide a fast path for the lexer instead of going through the + // function pointer. + bool encoding_changed; + + // This flag indicates that we are currently parsing a pattern matching + // expression and impacts that calculation of newlines. + bool pattern_matching_newlines; + + // This flag indicates that we are currently parsing a keyword argument. + bool in_keyword_arg; + + // Whether or not the parser has seen a token that has semantic meaning + // (i.e., a token that is not a comment or whitespace). + bool semantic_token_seen; + + // Whether or not we have found a frozen_string_literal magic comment with + // a true value. + bool frozen_string_literal; +}; + +#endif // YARP_PARSER_H |