Project 1: 词法分析和语法分析

在这个项目中,你将为BUPT编程语言(BPL)实现一个编译器的词法分析器和语法分析器。BPL是一种类C的编程语言,它去除了C标准中的大多数高级特性,比如宏和指针。项目的目标是编译BPL程序到MIPS32汇编代码。以下是关于词法分析和语法分析部分的详细说明。

1. 词法分析 (Lexical Analysis):

  • Flex工具: 你将使用GNU Flex生成词法分析器。Flex是一个快速的词法分析器生成器。你需要指定要匹配的令牌模式和对每个令牌应用的操作。Flex会将这些规格转换成一个NFA,然后将它转换成一个等价的DFA,并尽可能地最小化这个自动机,最终生成实现词法分析器的C代码。
  • Flex编码: 在使用Flex时,你将使用正则表达式来定义模式,并使用C代码来定义动作。你将编写.lex文件,并通过Flex生成一个C源文件,该文件包含一个实现所有规则和相应动作的有限自动机。
  • Flex特性: Flex还提供了一些有用的特性,如行号记录(yylineno)、输入和输出函数(input()和unput()),以及更多用于控制词法分析过程的函数。

2. 语法分析 (Syntax Analysis):

  • Bison工具: 你将使用GNU Bison生成语法分析器。Bison是一个解析器生成器,你提供一个文法规范的输入,Bison将生成一个LALR(1)解析器来识别该文法的句子。这个解析器接受来自Flex的输入令牌流,以识别指定的上下文无关文法。
  • Bison编码: 类似于Flex,Bison的源代码也包含可选的声明、定义和用户例程部分。你将为每个产生式关联一个动作,这允许你在使用该产生式进行规约时执行任何处理。你需要定义文法的每个非终结符,为每个规则指定优先级和结合性,并设置用于在词法分析器和解析器之间通信的全局变量。
  • Bison特性: Bison还提供了一些高级特性,如符号位置信息、错误恢复和冲突解决机制等。这些特性可以帮助你更有效地实现语法分析器,并提供更稳健的错误处理和更精确的语法分析。

项目要求:

  • 你需要实现一个解析器,它接受单个命令行参数(BPL文件路径),并输出语法有效的BPL程序的语法树或报告代码中存在的所有词法/语法错误。
  • 你的解析器应该能够识别未定义字符或标记、结构非法等词法和语法错误,并为语法有效的BPL程序打印出其语法树。
  • 你可以实现其他功能,如单行/多行注释、宏预处理器、文件包含、for语句等。

通过这个项目,你将学会如何使用现代的编译器工具(Flex和Bison)来自动化词法分析和语法分析过程,从而更深入地理解编译器前端的工作原理。同时,这也是构建编译器其他部分(如语义分析、中间代码生成、目标代码生成等)的基础。

lex.l

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
%{
#include"syntax.tab.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>

typedef struct Head
{
int is_end;
char type[100];
char id[100];
int line;
struct Head *child;
struct Head *brother;
} Node;
struct Head *new_Node(int is_end, char *type, char *id,int line, Node *child, Node *brother);

Node* new_Node(int is_end,char* type, char* id, int line, Node* child,Node* brother)
{
Node* newNode = (Node*)malloc(sizeof(Node));
if (!newNode)
{
perror("Memory allocation error");
exit(EXIT_FAILURE);
}
newNode->is_end = is_end;
if(line==0)
{
newNode->line=child->line;
}
else
{
newNode->line=line;
}
strcpy(newNode->type,type);
if (id)
{
strcpy(newNode->id,id);
}
else
{
newNode->id[0]='\0';
}
newNode->child = child;
newNode->brother = brother;
return newNode;
}

void print_tree(Node* result,int line)
{
if(result==NULL)
{
return;
}
if(strcmp(result->type,"empty")!=0)
{
for(int i=0;i<line;i++)
{
printf(" ");
}
}

if(result->is_end==1)
{
printf("%s",result->type);
if(strcmp(result->type,"ID")==0||strcmp(result->type,"TYPE")==0||strcmp(result->type,"INT")==0||
strcmp(result->type,"FLOAT")==0||strcmp(result->type,"CHAR")==0)
{
printf(": %s",result->id);
}
printf("\n");
}
else if(strcmp(result->type,"empty")!=0)
{
printf("%s (%d)\n",result->type,result->line);
}
print_tree(result->child,line+1);
print_tree(result->brother,line);
}
int line = 1;
%}
%option noyywrap

word_int [0-9]+|0x[0-9a-fA-F]+
word_float [0-9]+"."[0-9]+
word_id [a-zA-Z_][a-zA-Z0-9_]*
word_char \'.\'|\'\\x[0-9a-fA-F]{2}\'

%%
"int" { yylval.node=new_Node(1,"TYPE",yytext,line,NULL,NULL);return TYPE; }
"float" { yylval.node=new_Node(1,"TYPE",yytext,line,NULL,NULL);return TYPE; }
"char" { yylval.node=new_Node(1,"TYPE",yytext,line,NULL,NULL);return TYPE; }
"struct" { yylval.node=new_Node(1,"STRUCT",yytext,line,NULL,NULL);return STRUCT; }
"if" { yylval.node=new_Node(1,"IF",yytext,line,NULL,NULL);return IF; }
"else" { yylval.node=new_Node(1,"ELSE",yytext,line,NULL,NULL);return ELSE; }
"while" { yylval.node=new_Node(1,"WHILE",yytext,line,NULL,NULL);return WHILE; }
"return" { yylval.node=new_Node(1,"RETURN",yytext,line,NULL,NULL);return RETURN; }
{word_int} {yylval.node=new_Node(1,"INT",yytext,line,NULL,NULL);return INT;}
{word_id} { yylval.node=new_Node(1,"ID",yytext,line,NULL,NULL);return ID; }
{word_char} { yylval.node=new_Node(1,"CHAR",yytext,line,NULL,NULL);return CHAR; }
{word_float} { yylval.node=new_Node(1,"FLOAT",yytext,line,NULL,NULL);return FLOAT; }

"." { yylval.node=new_Node(1,"DOT",yytext,line,NULL,NULL);return DOT; }
";" { yylval.node=new_Node(1,"SEMI",yytext,line,NULL,NULL);return SEMI; }
"," { yylval.node=new_Node(1,"COMMA",yytext,line,NULL,NULL);return COMMA; }
"=" { yylval.node=new_Node(1,"ASSIGN",yytext,line,NULL,NULL);return ASSIGN; }
"<" { yylval.node=new_Node(1,"LT",yytext,line,NULL,NULL);return LT; }
">" { yylval.node=new_Node(1,"GT",yytext,line,NULL,NULL);return GT; }
"<=" { yylval.node=new_Node(1,"LE",yytext,line,NULL,NULL);return LE; }
">=" { yylval.node=new_Node(1,"GE",yytext,line,NULL,NULL);return GE; }
"!=" { yylval.node=new_Node(1,"NE",yytext,line,NULL,NULL);return NE; }
"==" { yylval.node=new_Node(1,"EQ",yytext,line,NULL,NULL);return EQ; }
"+" { yylval.node=new_Node(1,"PLUS",yytext,line,NULL,NULL);return PLUS; }
"-" { yylval.node=new_Node(1,"MINUS",yytext,line,NULL,NULL);return MINUS; }
"*" { yylval.node=new_Node(1,"MUL",yytext,line,NULL,NULL);return MUL; }
"/" { yylval.node=new_Node(1,"DIV",yytext,line,NULL,NULL);return DIV; }
"&&" { yylval.node=new_Node(1,"AND",yytext,line,NULL,NULL);return AND; }
"||" { yylval.node=new_Node(1,"OR",yytext,line,NULL,NULL);return OR; }
"!" { yylval.node=new_Node(1,"NOT",yytext,line,NULL,NULL);return NOT; }
"(" { yylval.node=new_Node(1,"LP",yytext,line,NULL,NULL);return LP; }
")" { yylval.node=new_Node(1,"RP",yytext,line,NULL,NULL);return RP; }
"[" { yylval.node=new_Node(1,"LB",yytext,line,NULL,NULL);return LB; }
"]" { yylval.node=new_Node(1,"RB",yytext,line,NULL,NULL);return RB; }
"{" { yylval.node=new_Node(1,"LC",yytext,line,NULL,NULL);return LC; }
"}" { yylval.node=new_Node(1,"RC",yytext,line,NULL,NULL);return RC; }

"//".*$ { /* ignore comments */ }
"/*"([^*]|[*][^/]|[*][*]+[^*/])*"*/" { for (size_t i = 0; i < strlen(yytext); i++) if (yytext[i] == '\n') line++; }
[ \t\r]+ { /* ignore whitespace */ }
"\n" {line++;}

0x([0-9a-fA-F]|[g-zG-Z])+ { yylval.node=new_Node(1,"ILLEGAL_HEX_INT",yytext,line,NULL,NULL); return ILLEGAL_HEX_INT; }
[0-9]{word_id} { yylval.node=new_Node(1,"ILLEGAL_ID",yytext,line,NULL,NULL); return ILLEGAL_ID; }
'\\x.*' { yylval.node=new_Node(1,"ILLEGAL_CHAR",yytext,line,NULL,NULL); return ILLEGAL_CHAR; }
. { yylval.node = new_Node(1,"ILLEGAL",yytext,line,NULL,NULL); return ILLEGAL; }
%%


syntax.y

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
%{
#include "lex.yy.c"

void yyerror();
Node* result=NULL;
int flag=0;
%}

%union
{
struct Head* node; /*"struct" is indispensable*/
}

%token <node> ID INT FLOAT CHAR STRUCT RETURN IF ELSE WHILE PLUS MINUS MUL DIV AND OR LT LE GT GE NE EQ NOT ASSIGN TYPE LP RP LB RB LC RC SEMI COMMA DOT ILLEGAL ILLEGAL_ID ILLEGAL_HEX_INT ILLEGAL_CHAR
%type <node> Program ExtDefList ExtDef ExtDecList Specifier StructSpecifier VarDec FunDec VarList ParamDec CompSt StmtList Stmt DefList Def DecList Dec Exp Args
%%

Program: ExtDefList {$$=new_Node(0,"Program","",0,$1,NULL);result=$$;}

ExtDefList: ExtDef ExtDefList {$1->brother = $2;$$=new_Node(0,"ExtDefList","",0,$1,NULL);}
| /* empty */ {$$=new_Node(0,"empty","",100,NULL,NULL);}

ExtDef: Specifier ExtDecList SEMI {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"ExtDef","",0,$1,NULL);}
| Specifier ExtDecList error {printf("Error type B at line %d: missing semicolon ';'\n", $1->line);}
| Specifier SEMI {$1->brother = $2;$$=new_Node(0,"ExtDef","",0,$1,NULL);}
| Specifier FunDec CompSt {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"ExtDef","",0,$1,NULL);}

ExtDecList: VarDec {$$=new_Node(0,"ExtDecList","",0,$1,NULL);}
| VarDec COMMA ExtDecList {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"ExtDecList","",0,$1,NULL);}

Specifier: TYPE {$$=new_Node(0,"Specifier","",0,$1,NULL);}
| StructSpecifier {$$=new_Node(0,"Specifier","",0,$1,NULL);}

StructSpecifier: STRUCT ID LC DefList RC {$1->brother = $2;$2->brother = $3;$3->brother = $4;$4->brother = $5;$$=new_Node(0,"StructSpecifier","",0,$1,NULL);}
| STRUCT ID {$1->brother = $2;$$=new_Node(0,"StructSpecifier","",0,$1,NULL);}

VarDec: ID {$$=new_Node(0,"VarDec","",0,$1,NULL);}
| ILLEGAL_ID error {printf("Error type A at line %d: illegal identifier '%s'\n", $1->line,$1->id);}
| VarDec LB INT RB {$1->brother = $2;$2->brother = $3;$3->brother = $4;$$=new_Node(0,"VarDec","",0,$1,NULL);}

FunDec: ID LP VarList RP {$1->brother = $2;$2->brother = $3;$3->brother = $4;$$=new_Node(0,"FunDec","",0,$1,NULL);}
| ID LP RP {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"FunDec","",0,$1,NULL);}
| ID LP VarList error {printf("Error type B at line %d: missing closing symbols ')'\n", $1->line);}
| ID LP error {printf("Error type B at line %d: missing closing symbols ')'\n", $1->line);}

VarList: ParamDec COMMA VarList {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"VarList","",0,$1,NULL);}
| ParamDec {$$=new_Node(0,"VarList","",0,$1,NULL);}

ParamDec: Specifier VarDec {$1->brother = $2;$$=new_Node(0,"ParamDec","",0,$1,NULL);}

CompSt: LC DefList StmtList RC {$1->brother = $2;$2->brother = $3;$3->brother = $4;$$=new_Node(0,"CompSt","",0,$1,NULL);}

StmtList: Stmt StmtList {$1->brother = $2;$$=new_Node(0,"StmtList","",0,$1,NULL);}
|Stmt Def StmtList error {printf("Error type B at line %d: definition after statement\n", $2->line);}
| /* empty */ {$$=new_Node(0,"empty","",100,NULL,NULL);}

Stmt: Exp SEMI {$1->brother = $2;$$=new_Node(0,"Stmt","",0,$1,NULL);}
| CompSt {$$=new_Node(0,"Stmt","",0,$1,NULL);}
| RETURN Exp SEMI {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"Stmt","",0,$1,NULL);}
| RETURN Exp error {printf("Error type B at line %d: missing semicolon ';'\n", $1->line);}
| IF LP Exp RP Stmt {$1->brother = $2;$2->brother = $3;$3->brother = $4;$4->brother = $5;$$=new_Node(0,"Stmt","",0,$1,NULL);}
| IF LP Exp RP Stmt ELSE Stmt {$1->brother = $2;$2->brother = $3;$3->brother = $4;$4->brother = $5;$5->brother = $6;$6->brother = $7;$$=new_Node(0,"Stmt","",0,$1,NULL);}
| WHILE LP Exp RP Stmt {$1->brother = $2;$2->brother = $3;$3->brother = $4;$4->brother = $5;$$=new_Node(0,"Stmt","",0,$1,NULL);}

DefList: Def DefList {$1->brother = $2;$$=new_Node(0,"DefList","",0,$1,NULL);}
| /* empty */ {$$=new_Node(0,"empty","",100,NULL,NULL);}

Def: Specifier DecList SEMI {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"Def","",0,$1,NULL);}
| Specifier DecList error {printf("Error type B at line %d: missing semicolon ';'\n", $1->line);}

DecList: Dec {$$=new_Node(0,"DecList","",0,$1,NULL);}
| Dec COMMA DecList {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"DecList","",0,$1,NULL);}

Dec: VarDec {$$=new_Node(0,"Dec","",0,$1,NULL);}
| VarDec ASSIGN Exp {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"Dec","",0,$1,NULL);}

Exp: Exp ASSIGN Exp {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"Exp","",0,$1,NULL);}
| Exp AND Exp {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"Exp","",0,$1,NULL);}
| Exp OR Exp {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"Exp","",0,$1,NULL);}
| Exp LT Exp {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"Exp","",0,$1,NULL);}
| Exp LE Exp {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"Exp","",0,$1,NULL);}
| Exp GT Exp {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"Exp","",0,$1,NULL);}
| Exp GE Exp {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"Exp","",0,$1,NULL);}
| Exp NE Exp {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"Exp","",0,$1,NULL);}
| Exp EQ Exp {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"Exp","",0,$1,NULL);}
| Exp PLUS Exp {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"Exp","",0,$1,NULL);}
| Exp MINUS Exp {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"Exp","",0,$1,NULL);}
| Exp MUL Exp {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"Exp","",0,$1,NULL);}
| Exp DIV Exp {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"Exp","",0,$1,NULL);}
| LP Exp RP {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"Exp","",0,$1,NULL);}
| LP Exp error {printf("Error type B at line %d: missing closing symbols ')'\n", $1->line);}
| MINUS Exp {$1->brother = $2;$$=new_Node(0,"Exp","",0,$1,NULL);}
| NOT Exp {$1->brother = $2;$$=new_Node(0,"Exp","",0,$1,NULL);}
| ID LP Args RP {$1->brother = $2;$2->brother = $3;$3->brother = $4;$$=new_Node(0,"Exp","",0,$1,NULL);}
| ID LP Args error {printf("Error type B at line %d: missing closing symbols ')'\n", $1->line);}
| ID LP RP {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"Exp","",0,$1,NULL);}
| Exp LB Exp RB {$1->brother = $2;$2->brother = $3;$3->brother = $4;$$=new_Node(0,"Exp","",0,$1,NULL);}
| Exp DOT ID {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"Exp","",0,$1,NULL);}
| ID {$$=new_Node(0,"Exp","",0,$1,NULL);}
| INT {$$=new_Node(0,"Exp","",0,$1,NULL);}
| FLOAT {$$=new_Node(0,"Exp","",0,$1,NULL);}
| CHAR {$$=new_Node(0,"Exp","",0,$1,NULL);}
| ILLEGAL error {printf("Error type A at line %d: illegal character '%s'\n", $1->line,$1->id);}
| ILLEGAL_HEX_INT error {printf("Error type A at line %d: illegal hexadecimal integer '%s'\n", $1->line,$1->id);}
| ILLEGAL_CHAR error {printf("Error type A at line %d: illegal hex_character '%s'\n", $1->line,$1->id);}
| Exp ILLEGAL Exp error {printf("Error type A at line %d: illegal operator '%s'\n", $2->line,$2->id);}

Args: Exp COMMA Args {$1->brother = $2;$2->brother = $3;$$=new_Node(0,"Args","",0,$1,NULL);}
| Exp {$$=new_Node(0,"Args","",0,$1,NULL);}

%%


void yyerror() {
flag=1;
return ;
}

int main(int argc, char **argv) {
if (argc != 2) {
fprintf(stderr, "Usage: %s <file_path>\n", argv[0]);
exit(-1);
} else if (!(yyin = fopen(argv[1], "r"))) {
perror(argv[1]);
exit(-1);
}
yyparse();
if(flag==0)
{
print_tree(result,0);
}
return 0;
}

Makefile

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
CC=gcc
FLEX=flex
BISON=bison


.lex: lex.l
$(FLEX) lex.l
.syntax: syntax.y
$(BISON) -t -d syntax.y
bplc: .lex .syntax
$(CC) syntax.tab.c -lfl -ly -o bplc.out
@mkdir bin
touch bin/bplc
@chmod +x bin/bplc
clean:
@rm -f lex.yy.c syntax.tab.* *.out
@rm -rf bin/
.PHONY: bplc

bplc_test.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import pathlib
import re
import subprocess


DATA = pathlib.Path('test')


def jsonparser_output(json_file):
out = subprocess.check_output(['./bplc.out', json_file])
return out.decode().strip()

def check_jsonchecker_fail_syntaxonly():
data = DATA
for bplfile in data.glob('*.bpl'):
out = jsonparser_output(bplfile)
print(f'For file {bplfile.name}:')
print(out)
print('='*60)
subfolder_name = "out"
temp=bplfile.name[0:10]
file_name = f"./{subfolder_name}/"+temp+".out"
with open(file_name, 'w') as file:
file.write(out)



# check_jsonchecker_fail_withlexical()
check_jsonchecker_fail_syntaxonly()

README.md

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# project1实验说明

该项目中完成了词法分析器和语法分析器的编写,可以通过终端指令实现bplc的编译和运行。

## 生成`bplc.out`文件

生成指令

```
make bplc
```

成功运行后会反馈以下信息

```
flex lex.1
bison -t -d syntax.y
syntax.y: warning: 241 shift/reduce conflicts [-Wconflicts-sr]
gcc syntax.tab.c -1f1 -ly -o bplc.out
```

并在根目录下生成`bplc.out`文件

## 运行编译检测功能

执行python指令一键运行

```
python3 bplc_test.py
```

运行结果会在终端反馈生成树和报错信息,并将每个文件的反馈信息以`.out`的形式保存在`\out`目录下