Yet Another Compiler
参考lex和yacc的输入格式,参考虎书《现代编译原理-C语言描述》的算法,大力整合优化,实现了LALR(1)的C#生成器(暂命名为bitParser)。
词法分析器
- 根据DFA和最小化DFA分别生成词法分析器代码(状态转换表、保留字、Token类型等)。
- 支持全Unicode字符。支持`int.MaxValue`个词法状态。
- 正则表达式支持`/`后缀,便于识别`id = refId`这类情况。
- 正则表达式支持`<'Vt'>`单个前缀,便于识别`struct type_name`这类情况。
- 正则表达式支持`<signal1, signal2, ..>`多个状态信号作为前缀,便于识别`<Comment>[^*\n]*`这样的情况。类似lex,但不完全相同。
- 无须单独列示Token或状态信号,即无须lex中的`%s NUM ID ..`,也无须`%x Comment End Text ..`。
- 无须显式书写保留字,即无须lex中的`[(] { return ('('); }`。
- 无须手动编写语义代码,即无须lex中的`#[0-9]+ { CreateNewText(YYText(),YYLeng()); return(token::IDENT); }`,但仍可在自动生成后手动修改。
- 注释详尽。每个状态的每个条件分支上都在注释中说明其正则表达式、关联的Token类型等。
- 生成ε-NFA、NFA、DFA、最小化DFA的状态图(mermaid格式)的文档,便于学习和调试。
- 生成每个Token类型的状态图(mermaid格式)的文档。
语法分析器
- 根据LL(1)、LR(0)、SLR(1)、LALR(1)、LR(1)分别生成语法分析器代码(分析表、规则列表、语法树结点类型等)。
- 支持关联指令`%nonassoc`、`%left`、`%right`、优先级指令`%prec`,自动解决Shift/Reduce、Reduce/Reduce冲突,并在分析表代码的注释中列示之。
- 注释详尽。在注释中列示:冲突数量、已解决数量、未解决数量;每个状态的LR项和lookahead等。
- 生成LL(1)、LR(0)、SLR(1)、LALR(1)、LR(1)的状态图(mermaid格式)和状态表(md文件中的Table)的文档,便于学习和调试。
- 生成nullable、FIRST集、FOLLOW集的文档。
其他
- 支持多行注释指令`%blockComment on/off`和单行注释指令`%inlineComment on/off`,默认格式同C语言的`/**/`和`//`,可自定义其格式。
- 支持Scope范围指令`%validScopeChars`和全局范围指令`%validGlobalChars`,默认范围均为`[\u0001-\uFFFF]`(即除`'\0'`外的全部Unicode字符),可自定义其范围。
- 生成语法结点的`class`类型框架和遍历语法树的框架,提供适用各种语言的格式化算法。可用于格式化、进一步生成中间代码。
- 大力优化,例如生成ANSI C语言的全部代码+文档只需3秒,生成GLSL4.60.8的全部代码+文档只需9秒。
举例-Calc.st
能够处理加减乘除运算的解析器,其文法如下:
input file: Calc.st
Exp : Exp '+' Term
    | Exp '-' Term
    | Term ;
Term : Term '*' Factor
     | Term '/' Factor
     | Factor ;
Factor : '(' Exp ')'
       | 'number' ;
%%[0-9]+%% 'number' // 示例只处理非负整数的四则运算
点击查看生成的终结点Vt和非终结点Vn代码
/// <summary>
/// String constants naming every grammar symbol of the Calc grammar.
/// <para>Vt (terminal) types are used both for lexical analyzing and syntax parse; their values are quoted in ''.</para>
/// <para>Vn (non-terminal) types are only for syntax parse; their values are not quoted in ''.</para>
/// </summary>
public static class st {
    // fixed st from template file.

    /// <summary>
    /// Block (multi-line) comment: /* xxx */
    /// </summary>
    public const string blockComment = "'blockComment'";

    /// <summary>
    /// Inline (single-line) comment: // xxx
    /// </summary>
    public const string inlineComment = "'inlineComment'";

    /// <summary>
    /// Something wrong within the source code.
    /// </summary>
    public const string Error = "'×'";

    // Vt

    /// <summary>
    /// '+'
    /// </summary>
    public const string @Plus符 = "'+'";

    /// <summary>
    /// '-'
    /// </summary>
    public const string @Dash符 = "'-'";

    /// <summary>
    /// '*'
    /// </summary>
    public const string @Asterisk符 = "'*'";

    /// <summary>
    /// '/'
    /// </summary>
    public const string @Slash符 = "'/'";

    /// <summary>
    /// '('
    /// </summary>
    public const string @LeftParenthesis符 = "'('";

    /// <summary>
    /// ')'
    /// </summary>
    public const string @RightParenthesis符 = "')'";

    /// <summary>
    /// 'number'
    /// </summary>
    public const string @number = "'number'";

    /// <summary>
    /// end of token list.
    /// </summary>
    public const string @终 = "'¥'";

    // Vn

    /// <summary>
    /// Exp
    /// </summary>
    public const string @vnExp = "Exp";

    /// <summary>
    /// Term
    /// </summary>
    public const string @vnTerm = "Term";

    /// <summary>
    /// Factor
    /// </summary>
    public const string @vnFactor = "Factor";
}
生成的词法分析器状态图如下:
点击查看生成的保留字相关代码
/// <summary>
/// Literal text of each reserved word (the operators and parentheses of the Calc grammar).
/// </summary>
public static class reservedWord {
    /// <summary>
    /// +
    /// </summary>
    public const string @Plus符 = "+";

    /// <summary>
    /// -
    /// </summary>
    public const string @Dash符 = "-";

    /// <summary>
    /// *
    /// </summary>
    public const string @Asterisk符 = "*";

    /// <summary>
    /// /
    /// </summary>
    public const string @Slash符 = "/";

    /// <summary>
    /// (
    /// </summary>
    public const string @LeftParenthesis符 = "(";

    /// <summary>
    /// )
    /// </summary>
    public const string @RightParenthesis符 = ")";
}
/// <summary>
/// If <paramref name="token"/> is a reserved word, assigns the corresponding Vt type to it and returns true.
/// <para>Otherwise leaves the token's type untouched and returns false.</para>
/// </summary>
/// <param name="token">The token whose <c>value</c> is compared against the reserved words and whose <c>type</c> is set on a match.</param>
/// <returns>true when <paramref name="token"/> matched a reserved word; otherwise false.</returns>
private static bool CheckReservedWord(AnalyzingToken token) {
    bool isReservedWord = true;
    switch (token.value) {
    //case reservedWord.@emptyReservedWord: token.type = st.@empty; break;
    case reservedWord.@Plus符: token.type = st.@Plus符; break;
    case reservedWord.@Dash符: token.type = st.@Dash符; break;
    case reservedWord.@Asterisk符: token.type = st.@Asterisk符; break;
    case reservedWord.@Slash符: token.type = st.@Slash符; break;
    case reservedWord.@LeftParenthesis符: token.type = st.@LeftParenthesis符; break;
    case reservedWord.@RightParenthesis符: token.type = st.@RightParenthesis符; break;
    default: isReservedWord = false; break; // not a reserved word: type is not modified here.
    }
    return isReservedWord;
}
点击查看生成的lexi状态0相关代码
/// <summary>
/// lexicalState0
/// <para>CompilerExp.Lexical●[1 DFA States]</para>
/// <para>Dispatches on the current char: begins a token and moves to the matching state,
/// skips whitespace and '\0', or reports any other char as an <c>st.Error</c> token.</para>
/// </summary>
private static readonly Action<LexicalContext, char> lexicalState0 =
    static (context, c) => {
        if (false) { /* for simpler code generation purpose. */ }
        /* user-input condition code */
        /* [0-9] */
        else if (/* possible Vt : 'number' */
        /* no possible signal */
        /* [xxx] scope */
        '0'/*'\u0030'(48)*/ <= c && c <= '9'/*'\u0039'(57)*/) {
            BeginToken(context);
            context.currentState = lexicalState1;
        }
        /* user-input condition code */
        /* \) */
        else if (/* possible Vt : ')' */
        /* no possible signal */
        /* single char */
        c == ')'/*'\u0029'(41)*/) {
            BeginToken(context);
            context.currentState = lexicalState2;
        }
        /* user-input condition code */
        /* \( */
        else if (/* possible Vt : '(' */
        /* no possible signal */
        /* single char */
        c == '('/*'\u0028'(40)*/) {
            BeginToken(context);
            context.currentState = lexicalState3;
        }
        /* user-input condition code */
        /* \/ */
        else if (/* possible Vt : '/' */
        /* no possible signal */
        /* single char */
        c == '/'/*'\u002F'(47)*/) {
            BeginToken(context);
            context.currentState = lexicalState4;
        }
        /* user-input condition code */
        /* \* */
        else if (/* possible Vt : '*' */
        /* no possible signal */
        /* single char */
        c == '*'/*'\u002A'(42)*/) {
            BeginToken(context);
            context.currentState = lexicalState5;
        }
        /* user-input condition code */
        /* - */
        else if (/* possible Vt : '-' */
        /* no possible signal */
        /* single char */
        c == '-'/*'\u002D'(45)*/) {
            BeginToken(context);
            context.currentState = lexicalState6;
        }
        /* user-input condition code */
        /* \+ */
        else if (/* possible Vt : '+' */
        /* no possible signal */
        /* single char */
        c == '+'/*'\u002B'(43)*/) {
            BeginToken(context);
            context.currentState = lexicalState7;
        }
        /* deal with everything else. */
        else if (c == ' ' || c == '\r' || c == '\n' || c == '\t' || c == '\0') {
            context.currentState = lexicalState0; // skip them.
        }
        else { // unexpected char.
            BeginToken(context);
            context.tokenEnd = context.cursor; // ExtendToken(context);
            AcceptToken(st.Error, context);
            context.currentState = lexicalState0;
        }
    };
点击查看nullable、FIRST集、FOLLOW集
nullable:
[0]: nullable( Exp' ) = False
[1]: nullable( Exp ) = False
[2]: nullable( Term ) = False
[3]: nullable( Factor ) = False
[4]: nullable( '¥' ) = False
[5]: nullable( '+' ) = False
[6]: nullable( '-' ) = False
[7]: nullable( '*' ) = False
[8]: nullable( '/' ) = False
[9]: nullable( '(' ) = False
[10]: nullable( ')' ) = False
[11]: nullable( 'number' ) = False
FIRST集:
[0]: FIRST( Exp' ) = { '(' 'number' }
[1]: FIRST( Exp ) = { '(' 'number' }
[2]: FIRST( Term ) = { '(' 'number' }
[3]: FIRST( Factor ) = { '(' 'number' }
[4]: FIRST( '¥' ) = { '¥' }
[5]: FIRST( '+' ) = { '+' }
[6]: FIRST( '-' ) = { '-' }
[7]: FIRST( '*' ) = { '*' }
[8]: FIRST( '/' ) = { '/' }
[9]: FIRST( '(' ) = { '(' }
[10]: FIRST( ')' ) = { ')' }
[11]: FIRST( 'number' ) = { 'number' }
[12]: FIRST( Exp '+' Term ) = { '(' 'number' }
[13]: FIRST( Exp '-' Term ) = { '(' 'number' }
[14]: FIRST( Term '*' Factor ) = { '(' 'number' }
[15]: FIRST( Term '/' Factor ) = { '(' 'number' }
[16]: FIRST( '(' Exp ')' ) = { '(' }
FOLLOW集:
[0]: FOLLOW( Exp' ) = { '¥' }
[1]: FOLLOW( Exp ) = { '-' ')' '+' '¥' }
[2]: FOLLOW( Term ) = { '-' ')' '*' '/' '+' '¥' }
[3]: FOLLOW( Factor ) = { '-' ')' '*' '/' '+' '¥' }
生成的语法分析器LALR(1)状态图和状态表如下:
| 状态 | '+' | '-' | '*' | '/' | '(' | ')' | 'number' | '¥' | Exp | Term | Factor |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 |  |  |  |  | S4 |  | S5 |  | G1 | G2 | G3 |
| 1 | S6 | S7 |  |  |  |  |  | ✅ |  |  |  |
| 2 | R[2] | R[2] | S8 | S9 |  | R[2] |  | R[2] |  |  |  |
| 3 | R[5] | R[5] | R[5] | R[5] |  | R[5] |  | R[5] |  |  |  |
| 4 |  |  |  |  | S4 |  | S5 |  | G10 | G2 | G3 |
| 5 | R[7] | R[7] | R[7] | R[7] |  | R[7] |  | R[7] |  |  |  |
| 6 |  |  |  |  | S4 |  | S5 |  |  | G11 | G3 |
| 7 |  |  |  |  | S4 |  | S5 |  |  | G12 | G3 |
| 8 |  |  |  |  | S4 |  | S5 |  |  |  | G13 |
| 9 |  |  |  |  | S4 |  | S5 |  |  |  | G14 |
| 10 | S6 | S7 |  |  |  | S15 |  |  |  |  |  |
| 11 | R[0] | R[0] | S8 | S9 |  | R[0] |  | R[0] |  |  |  |
| 12 | R[1] | R[1] | S8 | S9 |  | R[1] |  | R[1] |  |  |  |
| 13 | R[3] | R[3] | R[3] | R[3] |  | R[3] |  | R[3] |  |  |  |
| 14 | R[4] | R[4] | R[4] | R[4] |  | R[4] |  | R[4] |  |  |  |
| 15 | R[6] | R[6] | R[6] | R[6] |  | R[6] |  | R[6] |  |  |  |
点击查看生成的遍历语法树提取结点信息的代码
/// <summary>
/// Lookup from <see cref="LRNode.type"/> to the <see cref="Action{LRNode, TContext{Exp}}"/> registered for that type.
/// </summary>
private static readonly Dictionary<string/*LRNode.type*/, Action<LRNode, TContext<Exp>>> @expExtractorDict
    = new Dictionary<string/*LRNode.type*/, Action<LRNode, TContext<Exp>>>();

/// <summary>
/// Handler that pushes the node's <c>start</c> onto <c>context.objStack</c>.
/// </summary>
private static readonly Action<LRNode, TContext<Exp>> VtHandler =
    (node, context) => {
        context.objStack.Push(node.start);
    };
/// <summary>
/// Initializes the extractor dictionary.
/// <para>Every Vt type is registered with <c>VtHandler</c>; the end token and each Vn type
/// get a handler that pops the children of the reduced rule from <c>context.objStack</c>
/// (in reverse order) and builds the corresponding node.</para>
/// </summary>
private static void InitializeExtractorDict() {
    var extractorDict = @expExtractorDict;
    extractorDict.Add(st.@Plus符, VtHandler);
    extractorDict.Add(st.@Dash符, VtHandler);
    extractorDict.Add(st.@Asterisk符, VtHandler);
    extractorDict.Add(st.@Slash符, VtHandler);
    extractorDict.Add(st.@LeftParenthesis符, VtHandler);
    extractorDict.Add(st.@RightParenthesis符, VtHandler);
    extractorDict.Add(st.@number, VtHandler);
    extractorDict.Add(st.@终,
    static (node, context) => {
        // [-1]=Exp' : Exp ;
        // dumped by ExternalExtractor
        var @final = (VnExp?)context.objStack.Pop();
        var left = new Exp(@final);
        context.result = left; // final step, no need to push into stack.
    }); // end of extractorDict.Add(st.@终, (node, context) => { ... });
    extractorDict.Add(st.@vnExp,
    static (node, context) => {
        switch (node.regulation.index) {
        case 0: { // [0]=Exp : Exp '+' Term ;
            // dumped by ListExtractor 2
            var r0 = (VnTerm?)context.objStack.Pop();
            var r1 = (Token?)context.objStack.Pop();
            var r2 = (VnExp?)context.objStack.Pop();
            var left = r2;
            left.Add(r1, r0);
            context.objStack.Push(left);
        }
        break;
        case 1: { // [1]=Exp : Exp '-' Term ;
            // dumped by ListExtractor 2
            var r0 = (VnTerm?)context.objStack.Pop();
            var r1 = (Token?)context.objStack.Pop();
            var r2 = (VnExp?)context.objStack.Pop();
            var left = r2;
            left.Add(r1, r0);
            context.objStack.Push(left);
        }
        break;
        case 2: { // [2]=Exp : Term ;
            // dumped by ListExtractor 1
            var r0 = (VnTerm?)context.objStack.Pop();
            var left = new VnExp(r0);
            context.objStack.Push(left);
        }
        break;
        default: throw new NotImplementedException();
        }
    }); // end of extractorDict.Add(st.@vnExp, (node, context) => { ... });
    extractorDict.Add(st.@vnTerm,
    static (node, context) => {
        switch (node.regulation.index) {
        case 3: { // [3]=Term : Term '*' Factor ;
            // dumped by ListExtractor 2
            var r0 = (VnFactor?)context.objStack.Pop();
            var r1 = (Token?)context.objStack.Pop();
            var r2 = (VnTerm?)context.objStack.Pop();
            var left = r2;
            left.Add(r1, r0);
            context.objStack.Push(left);
        }
        break;
        case 4: { // [4]=Term : Term '/' Factor ;
            // dumped by ListExtractor 2
            var r0 = (VnFactor?)context.objStack.Pop();
            var r1 = (Token?)context.objStack.Pop();
            var r2 = (VnTerm?)context.objStack.Pop();
            var left = r2;
            left.Add(r1, r0);
            context.objStack.Push(left);
        }
        break;
        case 5: { // [5]=Term : Factor ;
            // dumped by ListExtractor 1
            var r0 = (VnFactor?)context.objStack.Pop();
            var left = new VnTerm(r0);
            context.objStack.Push(left);
        }
        break;
        default: throw new NotImplementedException();
        }
    }); // end of extractorDict.Add(st.@vnTerm, (node, context) => { ... });
    extractorDict.Add(st.@vnFactor,
    static (node, context) => {
        switch (node.regulation.index) {
        case 6: { // [6]=Factor : '(' Exp ')' ;
            // dumped by DefaultExtractor
            var r0 = (Token?)context.objStack.Pop();
            var r1 = (VnExp?)context.objStack.Pop();
            var r2 = (Token?)context.objStack.Pop();
            var left = new VnFactor(r2, r1, r0);
            context.objStack.Push(left);
        }
        break;
        case 7: { // [7]=Factor : 'number' ;
            // dumped by DefaultExtractor
            var r0 = (Token?)context.objStack.Pop();
            var left = new VnFactor(r0);
            context.objStack.Push(left);
        }
        break;
        default: throw new NotImplementedException();
        }
    }); // end of extractorDict.Add(st.@vnFactor, (node, context) => { ... });
}
关于通用的格式化算法,可参考我的另一篇文章(一个GLSL Shader的格式化算法(LALR解析器))。