使用 Menhir 构建 SimPL 的编译器
Lexer and Parser 语法分析模块
Lexer, Parser, AST 是三个依次耦合的模块,可以这么描述三者的关系:
Lexer ---tokens--> Parser ---nodes--> AST
相对于上面的图像化描述,cs3110 反过来构建整个 Lexer 和 Parser 的结构
在 ast.ml
中,定义了 AST 上节点 node 的类型:
type bop = | Add| Mul | Leq type expr = | Var of string| Int of int | Bool of bool| Binop of bop * expr * expr | If of expr * expr * expr | Let of string * expr * expr
在 parser.mly
中构建的 parser 接受 TOKEN
并将其翻译成 AST 上节点
(* Link the ast and the parser *)
%{open Ast
%}(* The name of tokens *)%token <int> INT
%token <string> ID
%token TRUE
%token FALSE
%token LEQ
%token TIMES
%token PLUS
%token LPAREN
%token RPAREN
%token LET
%token EQUALS
%token IN
%token IF
%token THEN
%token ELSE
%token EOF(* Compute the precedence and associativity of the operators *)
%nonassoc IN
%nonassoc ELSE
%left LEQ
%left PLUS
%left TIMES(* Define the start symbol: where to parse the program? *)
%start <Ast.expr> prog%%prog:| e = expr EOF { e };expr: | i = INT { Int i } (* integer *)| TRUE { Bool true } (* booleans *)| FALSE { Bool false }| e1 = expr Add e2 = expr { Binop (Add, e1, e2) } (* binary operators *)| e1 = expr Mul e2 = expr { Binop (Mul, e1, e2) }| e1 = expr Leq e2 = expr { Binop (Leq, e1, e2) }| LET; x = ID; EQUALS; e1 = expr; IN; e2 = expr { Let (x, e1, e2) } (* let expressions *)| IF; e1 = expr; THEN; e2 = expr; ELSE; e3 = expr { If (e1, e2, e3) } (* if expressions *)| LPAREN; e = expr; RPAREN { e } (* parentheses *);
lexer.mll
接受输入的 token 序列(也即从字符串角度看代码)并将其进行划分,返回 parser.mly
中早就定义好的 TOKEN
{open Parser
}(* Regular expressions for the lexer *)
let white = [' ' '\t']+
let digit = ['0'-'9']
let int = '-'?digit+
let letter = ['a'-'z' 'A'-'Z']
let id = letter (letter|digit)*(* Define the rules of lexer *)
rule read = parse (* 'read lexbuf' reads the next token from lexbuf *)| white { read lexbuf }| "true" { TRUE }| "false" { FALSE }| "<=" { LEQ }| "*" { TIMES }| "+" { PLUS }| "(" { LPAREN }| ")" { RPAREN }| "let" { LET }| "=" { EQUALS }| "in" { IN }| "if" { IF }| "then" { THEN }| "else" { ELSE }(* 'Lexing.lexeme lexbuf' returns the string that was matched *)| id { ID (Lexing.lexeme lexbuf) }(* 'int_of_string' converts a string to an integer *)| int { INT (int_of_string (Lexing.lexeme lexbuf)) }| eof { EOF }
对于 Menhir 工具来说,这样的构建顺序是合理的;毕竟看上去 lexer.mll
中定义 TOKEN
并不是那么方便 😃
语义分析模块
考虑小步语义风格,这方便我们在后面加入类型检查:
open Astlet parse (s : string) : expr = let lexbuf = Lexing.from_string s in let ast = Parser.prog Lexer.read lexbuf inast(** [is_value e] is whether [e] is a value. *)
let is_value e = match e with | Int _ | Bool _ -> true| _ -> false(** [subst e v x] is [e{v/x}]. *)
let subst _ _ _ =failwith "See next section"(** [step] is the [-->] relation, that is, a single step ofevaluation. *)
let rec step : expr -> expr = function| Int _ | Bool _ -> failwith "Does not step"| Var _ -> failwith "Unbound variable"| Binop (bop, e1, e2) when is_value e1 && is_value e2 ->step_bop bop e1 e2| Binop (bop, e1, e2) when is_value e1 ->Binop (bop, e1, step e2)| Binop (bop, e1, e2) -> Binop (bop, step e1, e2)| Let (x, e1, e2) when is_value e1 -> subst e2 e1 x| Let (x, e1, e2) -> Let (x, step e1, e2)| If (Bool true, e2, _) -> e2| If (Bool false, _, e3) -> e3| If (Int _, _, _) -> failwith "Guard of if must have type bool"| If (e1, e2, e3) -> If (step e1, e2, e3)(** [step_bop bop v1 v2] implements the primitive operation[v1 bop v2]. Requires: [v1] and [v2] are both values. *)
and step_bop bop e1 e2 = match bop, e1, e2 with| Add, Int a, Int b -> Int (a + b)| Mult, Int a, Int b -> Int (a * b)| Leq, Int a, Int b -> Bool (a <= b)| _ -> failwith "Operator and operand type mismatch"