yacc和lex结合使用

上一篇笔记中,我们已经体验了Yacc和Lex结合使用的方法,但那个例子非常简单,涉及到的Lex/Yacc用法还不足以解决“编写一个最简单的编程语言”这种需求。本篇笔记我们再来观察一个例子,解析简单的SQL语句。当然,SQL语法还算是比较复杂的,而且关键字也比较多,我们这里只是使用一种类似SQL的简化版语法,并非真正的SQL语句。

这里我们给出三条例子语句:

select [ESSN='01'](projection [ESSN,PNAME](WORKS_ON join PROJECT));

projection [BDATE] (select [DNAME='Research' & ENAME='John'] (EMPLOYEE join DEPARTMENT));

select [DNAME='Research' & ENAME='Mary'] (EMPLOYEE join DEPARTMENT);

我们使用中括号表示查询条件,小括号包围的是查询的子句。解析器部分我们的要求不高,写一个能恰好分析上面三条语句的解析器,并打印查询计划树即可。

词法分析器的设计

我们需要解析的Token如下:

SELECT PROJECTION JOIN EQUAL AND CONDITIONL CONDITIONR SUBQUERYL SUBQUERYR COMMA QUOTE STMTEND STRVAL

上面分别是:select关键字,projection关键字,join关键字,等号,与(&)号,左中括号,右中括号,左小括号,右小括号,逗号,单引号,分号(代表语句结束),字符串值。

parser.l

%{
#include <stdio.h>
#include <string.h>
#include "y.tab.h"
#include "parsetree.h"
extern int yylex(void);
%}
%%
select {
    yylval.queryNode = mallocQueryNode();
    yylval.queryNode->type = SELECT;
    snprintf(yylval.queryNode->text, NODE_TEXT_SIZE, "SELECT");
    return SELECT;
}
projection {
    yylval.queryNode = mallocQueryNode();
    yylval.queryNode->type = PROJECTION;
    snprintf(yylval.queryNode->text, NODE_TEXT_SIZE, "PROJECTION");
    return PROJECTION;
}
join {
    yylval.queryNode = mallocQueryNode();
    yylval.queryNode->type = JOIN;
    snprintf(yylval.queryNode->text, NODE_TEXT_SIZE, "JOIN");
    return JOIN;
}
= {
    return EQUAL;
}
& {
    return AND;
}
\[ {
    return CONDITIONL;
}
\] {
    return CONDITIONR;
}
\( {
    return SUBQUERYL;
}
\) {
    return SUBQUERYR;
}
, {
    return COMMA;
}
' {
    return QUOTE;
}
; {
    return STMTEND;
}
[a-zA-Z0-9_]+ {
    char *stringBuffer = (char *) malloc(sizeof(char) * STRBUFFER_SIZE);
    if(stringBuffer == NULL)
    {
        perror("projection_vars:projection_var malloc()");
        exit(EXIT_FAILURE);
    }
    strcpy(stringBuffer, yytext);
    yylval.strVal = stringBuffer;
    return STRVAL;
}
[ \t]+ {}
%%
int yywrap(void)
{
    return 1;
}

头文件我们引用了y.tab.h,这个头文件是Yacc生成的,里面包含Token的宏定义,想要结合Yacc就必须引用这个头文件。

yylval是用来向Yacc传值的。yylval其实是联合体类型,这里我们不多说,看完后面Yacc的代码再回来看yylval就能理解了。传值时,一定要注意字符串的传递方式,字符串最好在堆上分配,然后从yytext拷贝一次,否则会发生意想不到的情况。而传递结构体类型,也是直接malloc的(mallocQueryNode函数就会在堆上分配)。

注意每个正则表达式的匹配动作,我们直接返回了一个Token,其实这些Token是int类型的,他们由yacc生成了宏定义,定义在y.tab.h中。

语法分析器的生成式设计

接下来我们设计解析语法的文法生成式,仅仅满足上面三条语句的话,这个真的很简单,我想不需要多说了。

stmt->select_stmt STMTEND|projection_stmt STMTEND|join_stmt STMTEND

subquery_stmt->select_stmt|projection_stmt|join_stmt

select_stmt->SELECT CONDITIONL exprs CONDITIONR SUBQUERYL subquery_stmt SUBQUERYR

projection_stmt->PROJECTION CONDITIONL projection_vars CONDITIONR SUBQUERYL subquery_stmt SUBQUERYR

join_stmt->STRVAL JOIN STRVAL

exprs->expr|exprs AND expr

expr->STRVAL EQUAL QUOTE STRVAL QUOTE

projection_vars->projection_var|projection_vars COMMA projection_var

projection_var->STRVAL

下面我们根据上面的文法生成式,编写Yacc文件。

parser.y

%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "parsetree.h"
#include "expr.h"
#include "optimiser.h"
void yyerror(char *s);
%}

%union {
    struct QueryNode *queryNode;
    struct Expr *exprNode;
    char *strVal;
}

%token SELECT PROJECTION JOIN EQUAL AND CONDITIONL CONDITIONR SUBQUERYL SUBQUERYR COMMA QUOTE STMTEND STRVAL

%type<queryNode> SELECT PROJECTION JOIN
%type<strVal> STRVAL

%type<queryNode> stmt subquery_stmt select_stmt projection_stmt join_stmt
%type<strVal> expr projection_vars projection_var
%type<exprNode> exprs
%%
stmt: select_stmt STMTEND {
        printf("Parser:select stmt identified\n");
        $$ = $1;
        dumpTree($$);
        $$ = optimise($$);
        dumpTree($$);
    }
    | projection_stmt STMTEND {
        printf("Parser:projection stmt identified\n");
        $$ = $1;
        dumpTree($$);
        $$ = optimise($$);
        dumpTree($$);
    }
    | join_stmt STMTEND {
        printf("Parser:join stmt identified\n");
        $$ = $1;
        dumpTree($$);
        $$ = optimise($$);
        dumpTree($$);
    }
    ;

subquery_stmt: select_stmt {
        $$ = $1;
    }
    | projection_stmt {
        $$ = $1;
    }
    | join_stmt {
        $$ = $1;
    }
    ;

select_stmt: SELECT CONDITIONL exprs CONDITIONR SUBQUERYL subquery_stmt SUBQUERYR {

        struct Expr *p = $3;
        struct QueryNode *root = $6;
        while(p != NULL)
        {
            struct QueryNode *newSelectNode = mallocQueryNode();
            newSelectNode->type = SELECT;
            strcpy(newSelectNode->text, "SELECT ");
            strcat(newSelectNode->text, p->text);

            appendChild(newSelectNode, root);
            root = newSelectNode;

            p = p->next;
        }

        $$ = root;
    }
    ;

projection_stmt: PROJECTION CONDITIONL projection_vars CONDITIONR SUBQUERYL subquery_stmt SUBQUERYR {
        strcat($1->text, " ");
        strcat($1->text, $3);

        appendChild($1, $6);

        $$ = $1;
    }
    ;

join_stmt: STRVAL JOIN STRVAL {
        struct QueryNode *tbChild1 = mallocQueryNode();
        tbChild1->type = STRVAL;
        strcpy(tbChild1->text, $1);

        struct QueryNode *tbChild2 = mallocQueryNode();
        tbChild2->type = STRVAL;
        strcpy(tbChild2->text, $3);

        appendChild($2, tbChild1);
        appendChild($2, tbChild2);

        $$ = $2;
    }
    ;

exprs: expr {
        struct Expr *exprNode = mallocNewExpr();
        strcpy(exprNode->text, $1);

        $$ = exprNode;
    }
    |exprs AND expr {
        struct Expr *exprNode = mallocNewExpr();
        strcpy(exprNode->text, $3);
        appendExprList($1, exprNode);

        $$ = $1;
    }
    ;

expr: STRVAL EQUAL QUOTE STRVAL QUOTE {
        char *stringBuffer = (char *) malloc(sizeof(char) * STRBUFFER_SIZE);
        if(stringBuffer == NULL)
        {
            perror("projection_vars:projection_var malloc()");
            exit(EXIT_FAILURE);
        }
        snprintf(stringBuffer, STRBUFFER_SIZE, "%s='%s'", $1, $4);
        $$ = stringBuffer;
    }
    ;

projection_vars: projection_var {
        char *stringBuffer = (char *) malloc(sizeof(char) * STRBUFFER_SIZE);
        if(stringBuffer == NULL)
        {
            perror("projection_vars:projection_var malloc()");
            exit(EXIT_FAILURE);
        }
        strcpy(stringBuffer, $1);
        $$ = stringBuffer;
    }
    | projection_vars COMMA projection_var {
        char *stringBuffer = (char *) malloc(sizeof(char) * STRBUFFER_SIZE);
        if(stringBuffer == NULL)
        {
            perror("projection_vars:projection_vars COMMA projection_var malloc()");
            exit(EXIT_FAILURE);
        }
        snprintf(stringBuffer, STRBUFFER_SIZE, "%s,%s" , $1, $3);
        $$ = stringBuffer;
    }
    ;

projection_var: STRVAL {
        char *stringBuffer = (char *) malloc(sizeof(char) * STRBUFFER_SIZE);
        if(stringBuffer == NULL)
        {
            perror("projection_var:STRVAL malloc()");
            exit(EXIT_FAILURE);
        }
        strcpy(stringBuffer, $1);
        $$ = stringBuffer;
    }
    ;
%%
extern FILE *yyin;

int main(void)
{
    yyparse();
}

void yyerror(char *s)
{
    fprintf(stderr, "%s\n", s);
}

不要在意那些头文件,它们真的并不复杂,这里我们只需要关注Yacc的用法就行了。

%union是一个联合体,这个联合体其实就是yylval,我们使用这个联合体从Lex向Yacc传递值。

%type这个是定义了一系列终结符和非终结符的数据类型,注意这里数据类型其实是指%union中定义过的变量名,而不是int struct node*什么的。

$$$1$2...这些在文法生成式的动作代码中,用来指代生成式中的符号。$$指生成式左部,$1等指右部各个符号,注意,即使某个Token没有返回值,它也占用一个$n符号。我们的动作代码,基本就是构建逻辑查询计划树了。

总结

到现在为止,我们基本掌握了Lex和Yacc的结合使用,还有一些高级用法没有介绍,不过我们现在掌握的知识已经足够可以开始自由发挥了。

作者:Gacfox
版权声明:本网站为非盈利性质,文章如非特殊说明均为原创,版权遵循知识共享协议CC BY-NC-ND 4.0进行授权,转载必须署名,禁止用于商业目的或演绎修改后转载。
Copyright © 2017-2024 Gacfox All Rights Reserved.
Build with NextJS | Sitemap