"""Tokenizer and recursive-descent parser for a small subset of Lua.

``tokenize`` turns source text into ``(kind, value)`` pairs and ``LuaParser``
turns those pairs into a nested-tuple AST.
"""
import re

# --- Tokenizer ---

# Alternatives are tried in list order, so more specific patterns come first.
TOKEN_REGEX = [
    # COMMENT must precede SYMBOL, otherwise "--" is consumed as two "-"
    # symbols and comments are never recognised.  The long form requires the
    # closing bracket level to equal the opening one; the line form stops at
    # the newline explicitly because TOKEN_RE is compiled with re.DOTALL.
    ('COMMENT', r'--\[(?P<LONG_EQ>=*)\[.*?\](?P=LONG_EQ)\]|--[^\n]*'),
    ('KEYWORD', r'\b(and|break|do|else|elseif|end|false|for|function|goto|if|in|local|nil|not|or|repeat|return|then|true|until|while)\b'),
    ('NAME', r'[A-Za-z_][A-Za-z0-9_]*'),
    ('NUMBER', r'\d+(\.\d+)?'),
    ('STRING', r'"([^"\\]|\\.)*"|\'([^\'\\]|\\.)*\''),
    ('SYMBOL', r'==|~=|<=|>=|\.{2}|[+\-*/%^#=<>;:,.\[\](){}]'),
    ('SKIP', r'[ \t\r\n]+'),
    ('MISMATCH', r'.'),
]

TOKEN_RE = re.compile(
    '|'.join(f'(?P<{name}>{pattern})' for name, pattern in TOKEN_REGEX),
    re.DOTALL,
)


def tokenize(code):
    """Yield ``(kind, value)`` token pairs for the Lua source text *code*.

    Whitespace and comments are dropped.  ``kind`` is one of ``'KEYWORD'``,
    ``'NAME'``, ``'NUMBER'``, ``'STRING'`` or ``'SYMBOL'``; ``value`` is the
    matched source text (string tokens keep their quotes).

    Raises:
        SyntaxError: if a character matches no token pattern.
    """
    for match in TOKEN_RE.finditer(code):
        kind = match.lastgroup
        if kind in ('SKIP', 'COMMENT'):
            continue
        value = match.group()
        if kind == 'MISMATCH':
            raise SyntaxError(f'Unexpected token: {value}')
        yield kind, value


# --- Parser ---

class LuaParser:
    """Recursive-descent parser producing a nested-tuple AST.

    Every node is a tuple whose first element is a tag such as ``'block'``,
    ``'local'``, ``'assign'``, ``'call'``, ``'binop'`` or ``'unop'``.
    """

    # Keywords that close the block currently being parsed.
    _BLOCK_TERMINATORS = frozenset({'end', 'elseif', 'else', 'until'})

    # Binary operator binding strength; a larger number binds tighter.
    _PRECEDENCE = {
        'or': 1,
        'and': 2,
        '<': 3, '>': 3, '<=': 3, '>=': 3, '==': 3, '~=': 3,
        '..': 4,
        '+': 5, '-': 5,
        '*': 6, '/': 6, '%': 6,
        '^': 8,
    }
    # Lua defines concatenation and exponentiation as right-associative.
    _RIGHT_ASSOCIATIVE = frozenset({'..', '^'})
    # Unary operators bind tighter than every binary operator except '^'.
    _UNARY_OPERATORS = frozenset({
        ('KEYWORD', 'not'), ('SYMBOL', '-'), ('SYMBOL', '#'),
    })
    _UNARY_PRECEDENCE = 7

    def __init__(self, tokens):
        """Store *tokens* (any iterable of ``(kind, value)`` pairs)."""
        self.tokens = list(tokens)
        self.pos = 0

    def peek(self):
        """Return the current token, or ``(None, None)`` at end of input."""
        if self.pos < len(self.tokens):
            return self.tokens[self.pos]
        return (None, None)

    def eat(self, kind=None, value=None):
        """Consume and return the current token.

        When *kind* and/or *value* are given the token must match them.

        Raises:
            SyntaxError: on a mismatch, or when there is no token left.
        """
        token = self.peek()
        if kind and token[0] != kind:
            raise SyntaxError(f'Expected {kind}, got {token[0]}')
        if value and token[1] != value:
            raise SyntaxError(f'Expected {value}, got {token[1]}')
        if token[0] is None:
            # Never step past the end of the token list.
            raise SyntaxError('Unexpected end of input')
        self.pos += 1
        return token

    def parse(self):
        """Parse the whole token stream and return its ``('block', [...])``.

        Raises:
            SyntaxError: if the input is malformed, including a stray block
                terminator (e.g. ``end``) at top level, which would otherwise
                silently discard the remaining tokens.
        """
        block = self.parse_block()
        if self.pos < len(self.tokens):
            raise SyntaxError(
                f'Unexpected token at top level: {self.peek()[1]}'
            )
        return block

    def parse_block(self):
        """Parse statements until a block terminator or end of input."""
        block = []
        while self.pos < len(self.tokens):
            kind, value = self.peek()
            if kind == 'KEYWORD' and value in self._BLOCK_TERMINATORS:
                break
            block.append(self.parse_statement())
        return ('block', block)

    def parse_statement(self):
        """Parse one statement, dispatching on the leading token."""
        kind, value = self.peek()
        if value == 'local':
            return self.parse_local()
        if value == 'function':
            return self.parse_function()
        if value == 'if':
            return self.parse_if()
        if value == 'while':
            return self.parse_while()
        if value == 'return':
            return self.parse_return()
        if kind == 'NAME':
            return self.parse_assignment_or_call()
        raise SyntaxError(f'Unexpected statement: {value}')

    def parse_local(self):
        """Parse ``local NAME [= expr]`` into ``('local', name, expr|None)``."""
        self.eat('KEYWORD', 'local')
        name = self.eat('NAME')[1]
        if self.peek()[1] == '=':
            self.eat('SYMBOL', '=')
            return ('local', name, self.parse_expression())
        return ('local', name, None)

    def parse_function(self):
        """Parse ``function NAME(params) block end``.

        Returns ``('function', name, [param names], body_block)``.
        """
        self.eat('KEYWORD', 'function')
        name = self.eat('NAME')[1]
        self.eat('SYMBOL', '(')
        args = []
        while self.peek()[1] != ')':
            if args:
                self.eat('SYMBOL', ',')
            args.append(self.eat('NAME')[1])
        self.eat('SYMBOL', ')')
        body = self.parse_block()
        self.eat('KEYWORD', 'end')
        return ('function', name, args, body)

    def parse_if(self):
        """Parse an ``if`` statement with optional ``elseif``/``else`` arms.

        Returns ``('if', cond, then_block, [(cond, block), ...], else|None)``.
        """
        self.eat('KEYWORD', 'if')
        cond = self.parse_expression()
        self.eat('KEYWORD', 'then')
        then_block = self.parse_block()
        elseif_blocks = []
        while self.peek()[1] == 'elseif':
            self.eat('KEYWORD', 'elseif')
            elseif_cond = self.parse_expression()
            self.eat('KEYWORD', 'then')
            elseif_blocks.append((elseif_cond, self.parse_block()))
        else_block = None
        if self.peek()[1] == 'else':
            self.eat('KEYWORD', 'else')
            else_block = self.parse_block()
        self.eat('KEYWORD', 'end')
        return ('if', cond, then_block, elseif_blocks, else_block)

    def parse_while(self):
        """Parse ``while cond do block end`` into ``('while', cond, body)``."""
        self.eat('KEYWORD', 'while')
        cond = self.parse_expression()
        self.eat('KEYWORD', 'do')
        body = self.parse_block()
        self.eat('KEYWORD', 'end')
        return ('while', cond, body)

    def parse_return(self):
        """Parse ``return [expr]`` into ``('return', expr|None)``.

        A ``return`` directly followed by a block terminator or the end of
        input carries no value.
        """
        self.eat('KEYWORD', 'return')
        kind, value = self.peek()
        if kind is None or (
            kind == 'KEYWORD' and value in self._BLOCK_TERMINATORS
        ):
            return ('return', None)
        return ('return', self.parse_expression())

    def parse_assignment_or_call(self):
        """Parse ``NAME = expr`` or ``NAME(args)`` in statement position.

        Returns ``('assign', name, expr)`` or ``('call', name, [args])``.
        """
        name = self.eat('NAME')[1]
        next_value = self.peek()[1]
        if next_value == '=':
            self.eat('SYMBOL', '=')
            return ('assign', name, self.parse_expression())
        if next_value == '(':
            return ('call', name, self._parse_call_args())
        raise SyntaxError(
            f'Unexpected token after identifier: {self.peek()}'
        )

    def _parse_call_args(self):
        """Parse a parenthesised, comma-separated argument list."""
        self.eat('SYMBOL', '(')
        args = []
        while self.peek()[1] != ')':
            if args:
                self.eat('SYMBOL', ',')
            args.append(self.parse_expression())
        self.eat('SYMBOL', ')')
        return args

    def parse_expression(self, precedence=0):
        """Parse an expression by precedence climbing.

        Only binary operators whose precedence is at least *precedence* are
        consumed.  Returns a primary node, ``('unop', op, operand)`` or
        ``('binop', op, left, right)``.
        """
        lead = self.peek()
        if lead in self._UNARY_OPERATORS:
            self.eat()
            operand = self.parse_expression(self._UNARY_PRECEDENCE)
            expr = ('unop', lead[1], operand)
        else:
            expr = self.parse_primary()

        while True:
            kind, op = self.peek()
            if kind != 'SYMBOL' and kind != 'KEYWORD':
                break
            prec = self.get_precedence(op)
            if prec < precedence:
                break
            self.eat()
            # A right-associative operator lets the right operand absorb
            # further operators of the same precedence.
            if op in self._RIGHT_ASSOCIATIVE:
                right = self.parse_expression(prec)
            else:
                right = self.parse_expression(prec + 1)
            expr = ('binop', op, expr, right)
        return expr

    def parse_primary(self):
        """Parse a literal, name, call, parenthesised expression or table."""
        kind, value = self.peek()
        if kind == 'NUMBER':
            return ('number', float(self.eat()[1]))
        if kind == 'STRING':
            return ('string', self.eat()[1])
        if kind == 'NAME':
            name = self.eat()[1]
            if self.peek() == ('SYMBOL', '('):
                return ('call', name, self._parse_call_args())
            return ('name', name)
        if value == '(':
            self.eat('SYMBOL', '(')
            expr = self.parse_expression()
            self.eat('SYMBOL', ')')
            return expr
        if value == '{':
            return self.parse_table()
        if value == 'nil':
            self.eat('KEYWORD', 'nil')
            return ('nil',)
        if value == 'true':
            self.eat('KEYWORD', 'true')
            return ('bool', True)
        if value == 'false':
            self.eat('KEYWORD', 'false')
            return ('bool', False)
        raise SyntaxError(f'Unexpected token in expression: {value}')

    def parse_table(self):
        """Parse ``{expr, expr, ...}`` into ``('table', [fields])``.

        A trailing comma before ``}`` is accepted, as in Lua.
        """
        self.eat('SYMBOL', '{')
        fields = []
        while self.peek()[1] != '}':
            fields.append(self.parse_expression())
            if self.peek()[1] != ',':
                break
            self.eat('SYMBOL', ',')
        self.eat('SYMBOL', '}')
        return ('table', fields)

    def get_precedence(self, op):
        """Return the binary precedence of *op*, or -1 if it is not one."""
        return self._PRECEDENCE.get(op, -1)


# --- Test ---

if __name__ == '__main__':
    code = """
    local x = 1
    local y = x + 2 * 3
    function hello(a, b)
        if a > b then
            return a
        elseif b > a then
            return b
        else
            return 0
        end
    end
    while x < 10 do
        x = x + 1
        print(x)
    end
    """
    tokens = tokenize(code)
    parser = LuaParser(tokens)
    ast = parser.parse()
    from pprint import pprint
    pprint(ast)