From 4e408f29f4b56adc9287bdf5373f8f283f5799b8 Mon Sep 17 00:00:00 2001 From: lensferno Date: Tue, 16 May 2023 21:43:59 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AD=98=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 28 +++ .idea/.gitignore | 8 + .idea/codeStyles/codeStyleConfig.xml | 5 + .idea/deployment.xml | 14 ++ .idea/lexer_cpp.iml | 2 + .idea/misc.xml | 4 + .idea/modules.xml | 8 + .idea/statistic.xml | 12 ++ CMakeLists.txt | 6 + grammar.txt | 11 ++ lexer/Lexer.cpp | 217 ++++++++++++++++++++++ lexer/Lexer.h | 45 +++++ lexer/Token.cpp | 29 +++ lexer/Token.h | 75 ++++++++ main.cpp | 50 +++++ parser/Checker.cpp | 143 +++++++++++++++ parser/Checker.h | 26 +++ parser/grammar/AnalysisTableBuilder.cpp | 58 ++++++ parser/grammar/First.cpp | 53 ++++++ parser/grammar/Follow.cpp | 187 +++++++++++++++++++ parser/grammar/Grammar.cpp | 234 ++++++++++++++++++++++++ parser/grammar/Grammar.h | 119 ++++++++++++ parser/grammar/Select.cpp | 72 ++++++++ parser/test.cpp | 80 ++++++++ testCode.txt | 0 25 files changed, 1486 insertions(+) create mode 100644 .gitignore create mode 100644 .idea/.gitignore create mode 100644 .idea/codeStyles/codeStyleConfig.xml create mode 100644 .idea/deployment.xml create mode 100644 .idea/lexer_cpp.iml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/statistic.xml create mode 100644 CMakeLists.txt create mode 100644 grammar.txt create mode 100644 lexer/Lexer.cpp create mode 100644 lexer/Lexer.h create mode 100644 lexer/Token.cpp create mode 100644 lexer/Token.h create mode 100644 main.cpp create mode 100644 parser/Checker.cpp create mode 100644 parser/Checker.h create mode 100644 parser/grammar/AnalysisTableBuilder.cpp create mode 100644 parser/grammar/First.cpp create mode 100644 parser/grammar/Follow.cpp create mode 100644 parser/grammar/Grammar.cpp create mode 100644 parser/grammar/Grammar.h create mode 100644 parser/grammar/Select.cpp create mode 100644 parser/test.cpp create mode 100644 testCode.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..18ce97b --- /dev/null +++ b/.gitignore @@ -0,0 +1,28 @@ +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# AWS User-specific +.idea/**/aws.xml + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# CMake +cmake-build-*/ + +out/ + +.vs/ \ No newline at end of file diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/codeStyles/codeStyleConfig.xml b/.idea/codeStyles/codeStyleConfig.xml new file mode 100644 index 0000000..a55e7a1 --- /dev/null +++ b/.idea/codeStyles/codeStyleConfig.xml @@ -0,0 +1,5 @@ + + + + \ No newline at end of file diff --git a/.idea/deployment.xml b/.idea/deployment.xml new file mode 100644 index 0000000..af7bcdf --- /dev/null +++ b/.idea/deployment.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/lexer_cpp.iml b/.idea/lexer_cpp.iml new file mode 100644 index 0000000..f08604b --- /dev/null +++ b/.idea/lexer_cpp.iml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..79b3c94 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..e1ac88b --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/statistic.xml b/.idea/statistic.xml new file mode 100644 index 0000000..2630bd6 --- /dev/null +++ b/.idea/statistic.xml @@ -0,0 +1,12 @@ + + + + + + \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..9f1f6f6 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,6 @@ +cmake_minimum_required(VERSION 3.21) +project(lexer_cpp) + +set(CMAKE_CXX_STANDARD 23) + +add_executable(lexer_cpp main.cpp lexer/Lexer.cpp lexer/Lexer.h lexer/Token.cpp lexer/Token.h parser/Checker.cpp parser/Checker.h parser/Checker.h parser/grammar/First.cpp parser/grammar/Grammar.cpp parser/grammar/Grammar.h parser/test.cpp parser/grammar/Follow.cpp parser/grammar/Select.cpp parser/grammar/AnalysisTableBuilder.cpp) diff --git a/grammar.txt b/grammar.txt new file mode 100644 index 0000000..5db88c6 --- /dev/null +++ b/grammar.txt @@ -0,0 +1,11 @@ +ESFGT +()*+i~ +---------(此行做分割用)--------- +E->TG +G->+TG +G->~ +T->FS +S->*FS +S->~ +F->(E) +F->i \ No newline at end of file diff --git a/lexer/Lexer.cpp b/lexer/Lexer.cpp new file mode 100644 index 0000000..d87528b --- /dev/null +++ b/lexer/Lexer.cpp @@ -0,0 +1,217 @@ +// +// Created by lenfrex on 2023/4/13. +// + +#include "Lexer.h" + +#include "Token.h" + +using namespace std; + +Lexer::Lexer(char* code) { + buffer = code; +} + +/* + * 对所有文本进行词法分析 + */ +void Lexer::lexAll() { + while (!stop && *buffer != '\0') { + Token token = Token(); + Lexer::lexToken(token); + results.push_back(token); + } +} + +const vector &Lexer::getResults() const { + return results; +} + +#define isWhitespace(x) (x == ' ' || x == '\t') + +// 仿照Clang编译器前端的Lexer中Lexer::LexTokenInternal方法的实现 +// 经过极度的精简,只是模仿了大致的思路,具体流程也不是按照Clang原版的去实现的 +void Lexer::lexToken(Token &token) { + LexStart: + // 过滤空格等 + if (isWhitespace(*buffer)) { + do { + ++buffer; + ++currPos; + } while (isWhitespace(*buffer)); + } + + switch (*buffer) { + case '\0': + stop = true; + + // 换行,重置相关参数 + case '\r': + buffer++; + currLine++; + currPos = 0; + goto LexStart; + case '\n': + buffer++; + currLine++; + currPos = 0; + goto LexStart; + // 处理小于号和小于号开头的符号 + case '<': { + char next = *(buffer+1); + if (next == '=') { + int length = 2; + Lexer::setToken(token, TokenType::OPERATOR, buffer, length); + return Lexer::move(length); + + // 实际上TokenType其实应该更详细点,例如大于等于号,大于号,逗号这种具体的符号 + // 为简单起见,只分成五大类,况且题目只要求分成五大类( + // 这里加了个"<<"这种符号的解析只不过是为了展示分类具体符号情况下的实现 + } else if (next == '<') { + int length = 2; + Lexer::setToken(token, TokenType::OPERATOR, buffer, length); + return Lexer::move(length); + + } else { + int length = 1; + Lexer::setToken(token, TokenType::OPERATOR, buffer, length); + return Lexer::move(length); + } + } + + case '>': { + char next = *(buffer+1); + if (next == '=') { + int length = 2; + Lexer::setToken(token, TokenType::OPERATOR, buffer, length); + return Lexer::move(length); + + } else { + int length = 1; + Lexer::setToken(token, TokenType::OPERATOR, buffer, length); + return Lexer::move(length); + } + } + + case ':': { + char next = *(buffer+1); + if (next == '=') { + int length = 2; + Lexer::setToken(token, TokenType::OPERATOR, buffer, length); + return Lexer::move(length); + + } else { + int length = 1; + Lexer::setToken(token, TokenType::UNKNOWN, buffer, length); + return Lexer::move(length); + } + } + + // 单符号运算符 + case '+': case '-': case '*': + case '/': case '#': case '=': { + Lexer::setToken(token, TokenType::OPERATOR, buffer, 1); + return Lexer::move(1); + } + + // 界符,句号'.'特别关照 + case '.': +// stop = true; + case '(': case ')': case ',': + case ';': /* '.' */ { + Lexer::setToken(token, TokenType::DELIMITER, buffer, 1); + return Lexer::move(1); + } + + // 数字 + case '0' ... '9': { + return Lexer::lexNumber(token); + } + + // 标识符,为简单起见,这里只接受ascii字母'a'-'z','A'-'Z'以及下划线'_'作为标识符,utf8字符不考虑 + case 'A' ... 'Z': + case 'a' ... 'z': + case '_' : { + return Lexer::lexIdentifier(token); + } + + default: + Lexer::setToken(token, TokenType::UNKNOWN, buffer, 1); + return Lexer::move(1); + } +} + +#define isNumber(x) (x >= '0' && x <= '9') +#define isIllegalIdentifier(x) ((x >= '0' && x <= '9') || \ + (x >= 'a' && x <= 'z') || (x >= 'A' && x <= 'Z') || \ + (x == '_')) + +inline void Lexer::lexNumber(Token &token) { + char *next = buffer + 1; + int length = 1; + + while (isNumber(*next)) { + next++; + length++; + } + + Lexer::setToken(token, TokenType::CONSTANTS, buffer, length); + + return Lexer::move(length); +} + +inline void Lexer::lexIdentifier(Token &token) { + // 传进来的时候已经有一个字符了,所以跳过第一个字符 + char *next = buffer + 1; + int length = 1; + + while (isIllegalIdentifier(*next)) { + next++; + length++; + } + + string content = Lexer::copyAsString(buffer, length); + + // 检查是否关键字 + for (const string & keyword : keywords) { + if (content == keyword) { + Lexer::setToken(token, TokenType::KEYWORD, content, length); + return Lexer::move(length); + } + } + + Lexer::setToken(token, TokenType::IDENTIFIER, content, length); + + return Lexer::move(length); +} + +inline string Lexer::copyAsString(const char *sourceStart, int length) { + // 预留一个长度最后一个放置'\0' + char *tmp = new char[length + 1]{0}; + for (int i = 0; i < length; ++i) { + tmp[i] = *(sourceStart + i); + } + + string result; + result.append(tmp); + + delete[] tmp; + return result; +} + +void Lexer::setToken(Token &token, TokenType type, const char *source, int length) { + token.setType(type); + token.setLocation(Location{.line=currLine, .start=currPos, .length=length}); + token.setContent(copyAsString(source, length)); +} + +void Lexer::setToken(Token &token, TokenType type, const string &content, int length) { + token.setType(type); + token.setLocation(Location{.line=currLine, .start=currPos, .length=length}); + token.setContent(content); +} + +void Lexer::move(int offset) { + currPos += offset; + buffer = buffer + offset; +} diff --git a/lexer/Lexer.h b/lexer/Lexer.h new file mode 100644 index 0000000..3cd0e36 --- /dev/null +++ b/lexer/Lexer.h @@ -0,0 +1,45 @@ +// +// Created by lenfrex on 2023/4/13. +// + +#ifndef LEXER_CPP_LEXER_H +#define LEXER_CPP_LEXER_H + +#include +#include +#include "Token.h" + +class Lexer { +public: + explicit Lexer(char *code); + + void lexAll(); + + [[nodiscard]] const std::vector &getResults() const; + +private: + char *buffer; + + std::vector results = std::vector(); + + bool stop = false; + + int currLine = 0; + int currPos = 0; + + void lexNumber(Token &token); + + void lexIdentifier(Token &token); + + static std::string copyAsString(const char *sourceStart, int length); + + void setToken(Token &token, TokenType type, const char *source, int length); + + void setToken(Token &token, TokenType type, const std::string &content, int length); + + void move(int offset); + + void lexToken(Token &token); +}; + +#endif //LEXER_CPP_LEXER_H diff --git a/lexer/Token.cpp b/lexer/Token.cpp new file mode 100644 index 0000000..2f4f8a0 --- /dev/null +++ b/lexer/Token.cpp @@ -0,0 +1,29 @@ +// +// Created by lenfrex on 2023/4/13. +// + +#include "Token.h" + +const Location &Token::getLocation() const { + return location; +} + +void Token::setLocation(const Location &loc) { + Token::location = loc; +} + +TokenType Token::getType() const { + return type; +} + +void Token::setType(TokenType typ) { + Token::type = typ; +} + +const std::string &Token::getContent() const { + return content; +} + +void Token::setContent(const std::string &cont) { + Token::content = cont; +} diff --git a/lexer/Token.h b/lexer/Token.h new file mode 100644 index 0000000..eae3175 --- /dev/null +++ b/lexer/Token.h @@ -0,0 +1,75 @@ +// +// Created by lenfrex on 2023/4/13. +// + +#ifndef LEXER_CPP_TOKEN_H +#define LEXER_CPP_TOKEN_H + +#include + +#define KEYWORD_SIZE 17 + +const std::string keywords[KEYWORD_SIZE] = { + "begin", "end", "call", "const", "do", "length", "if", "odd", + "procedure", "read", "then", "var", "while", "write", + + "Number", "Name", "HeTaiyu", +}; + +const std::string TokenTypeName[] = { + "运算符", + "标识符", + "常数", + "关键字", + "界符", + "未知" +}; + +typedef enum TokenType { + // 运算符 + OPERATOR, + + // 标识符 + IDENTIFIER, + + // 常数 + CONSTANTS, + + // 保留字 + KEYWORD, + + // 界符 + DELIMITER, + + // 未知 + UNKNOWN +} TokenType; + +typedef struct Location { + int line; + int start; + int length; +} Location; + +class Token { +private: + Location location{}; + + TokenType type = UNKNOWN; + std::string content; + +public: + [[nodiscard]] const Location &getLocation() const; + + void setLocation(const Location &loc); + + [[nodiscard]] TokenType getType() const; + + void setType(TokenType typ); + + [[nodiscard]] const std::string &getContent() const; + + void setContent(const std::string &cont); +}; + +#endif //LEXER_CPP_TOKEN_H diff --git a/main.cpp b/main.cpp new file mode 100644 index 0000000..2225c53 --- /dev/null +++ b/main.cpp @@ -0,0 +1,50 @@ +#include +#include +#include +#include +#include +#include "lexer/Token.h" +#include "lexer/Lexer.h" + +using namespace std; + +//int main() { +//#ifdef __WIN32__ +// system("chcp 65001"); +//#endif +// string a; +//cin >>a; +// fstream file; +// file.open("../learn2.sql", ios::in); +// +// if (!file.is_open()) { +// cout << "打开文件时发生错误"; +// return -1; +// } +// +// stringstream stream; +// stream << file.rdbuf(); +// string str = stream.str(); +// +//// cout << str << endl; +// +// Lexer parser = Lexer(str.data()); +// +// parser.lexAll(); +// +// vector tokens = parser.getResults(); +// +// cout << "-------------------------" << endl; +// cout << "总的Token数量: " << tokens.size() << endl; +// int count = 0; +// for (const auto &token: tokens) { +// cout << "-------------------------" << endl; +// cout << count++ << endl; +// cout << "内容: \t" << token.getContent() << endl; +// cout << "类型: \t" << TokenTypeName[token.getType()] << endl; +// cout << "长度: \t" << token.getLocation().length << endl; +// cout << "位置: \t[行:" << token.getLocation().line << ", 开始: " << token.getLocation().startChar << "]" << endl; +// } +// +// return 0; +//} diff --git a/parser/Checker.cpp b/parser/Checker.cpp new file mode 100644 index 0000000..74c2c00 --- /dev/null +++ b/parser/Checker.cpp @@ -0,0 +1,143 @@ +#include +#include +#include "iostream" + +#include "Checker.h" + +using namespace std; + +#define isEmptyStr(c) (c == '~') + +// 能否推导出空串 +bool Checker::canDeducedEmpty(const set &productions) { + for (const auto &production: productions) { + for (const auto &c: production) { + if (isEmptyStr(c)) { + return true; + } + } + } + + return false; +} + +string Checker::getStackContent(stack source) { + string str; + while (!source.empty()) { + str.push_back(source.top()); + source.pop(); + } + + std::reverse(str.begin(), str.end()); + + return str; +} + +void printTableHeader() { + cout << setw(16) << left << setfill(' ') << "步"; + cout << setw(20) << left << setfill(' ') << "分析栈情况"; + cout << setw(23) << left << setfill(' ') << "待分析字符串"; + cout << setw(25) << left << setfill(' ') << "使用产生式与匹配情况" << endl; +} + +bool Checker::identifyString(std::string input) { + printTableHeader(); + + // 添加#到输入字符串尾部 + input.push_back('#'); + + stack analysisStack = stack(); + + // #先入栈 + analysisStack.push('#'); + + // 开始符号入栈, S + analysisStack.push(grammar.getStartChar()); + + int step = 0; + + string remain = input; + string status = "分析中"; + + cout << setw(16) << left << setfill(' ') << step; + cout << setw(16) << left << setfill(' ') << getStackContent(analysisStack); + cout << setw(16) << left << setfill(' ') << remain; + + const set &terminals = grammar.getTerminals(); + const auto &productionsMap = grammar.getGrammarExpresses(); + const auto &analysisTable = grammar.getAnalysisTable(); + + for (auto pos = input.begin(); pos != input.end();) { + char currentChar = *pos; + char stackTopChar = analysisStack.top(); + + // 如果当前栈顶是终结符,对比当前输入字符和栈顶终结符是否匹配 + // 匹配则进行规约,否则判定不匹配 + if (terminals.contains(stackTopChar)) { + if (currentChar == stackTopChar) { + analysisStack.pop(); + for (const auto &c: remain) { + if (c == currentChar) { + remain = remain.substr(1); + break; + } + } + + status.clear(); + status.push_back('\''); + status.push_back(currentChar); + status += "': 匹配"; + pos++; + } else { + cout << "匹配失败。期望输入:" << stackTopChar; + cout << ",得到输入:" << currentChar << endl; + return false; + } + } else if (stackTopChar == '#') { + return currentChar == '#'; + } else { + /* 对非终结符的处理 */ + // 预测分析表中找不到对应的规则,但是当前的非终止符能推出空串,就退栈,该非终止符作为空串推测处理 + // 否则就判定不接受 + bool foundInTable = analysisTable.at(stackTopChar).contains(currentChar); + if (!foundInTable) { + if (canDeducedEmpty(productionsMap.at(stackTopChar))) { + cout << "找不到" << stackTopChar << "与" << currentChar << "相匹配的规则" << endl; + return false; + } else { + analysisStack.pop(); + status.clear(); + status.push_back(stackTopChar); + status += "->~"; + } + } else { + analysisStack.pop(); + + status.clear(); + status.push_back(stackTopChar); + string production = analysisTable.at(stackTopChar).at(currentChar); + status += "-> " + production; + + for (int i = (int) production.size() - 1; i >= 0; --i) { + char c = production[i]; + if (c != '~') { + analysisStack.push(c); + } + } + } + } + + step++; + + cout << setw(16) << left << setfill(' ') << status << endl; + cout << setw(16) << left << setfill(' ') << step; + cout << setw(16) << left << setfill(' ') << getStackContent(analysisStack); + cout << setw(16) << left << setfill(' ') << remain; + } + + cout << endl; + + return true; +} + +Checker::Checker(Grammar grammar) : grammar(std::move(grammar)){} \ No newline at end of file diff --git a/parser/Checker.h b/parser/Checker.h new file mode 100644 index 0000000..e1dcaa6 --- /dev/null +++ b/parser/Checker.h @@ -0,0 +1,26 @@ +// +// Created by lenfrex on 2023/5/5. +// + +#ifndef LEXER_CPP_CHECKER_H +#define LEXER_CPP_CHECKER_H + +#include "stack" +#include "grammar/Grammar.h" + +class Checker { +private: + Grammar grammar; + + static std::string getStackContent(std::stack source); + +public: + explicit Checker(Grammar grammar); + + bool identifyString(std::string input); + + static bool canDeducedEmpty(const std::set &productions); +}; + + +#endif //LEXER_CPP_CHECKER_H diff --git a/parser/grammar/AnalysisTableBuilder.cpp b/parser/grammar/AnalysisTableBuilder.cpp new file mode 100644 index 0000000..32112a7 --- /dev/null +++ b/parser/grammar/AnalysisTableBuilder.cpp @@ -0,0 +1,58 @@ +// +// Created by lenfrex on 2023/5/8. +// + +#include "Grammar.h" + +using namespace std; + +// 求是否有交集 +bool hasUnion(const set &a, const set &b) { + for (const auto &selectChars: b) { + if (a.contains(selectChars)) { + return true; + } + } + + return false; +} + +// 检查是否为LL1文法 +bool Grammar::isLL1Grammar() { + // 逐个非终止符检查 + for (const auto &nonTerminal: nonTerminals) { + SelectSet select = selectSet[nonTerminal]; + + // 取一行 + for (auto current = select.begin(); current != select.end();) { + const auto &productionSelectSet = current->second; + // 取剩下的行进行比较,注意这里已经++current了,外层for就不要++了 + for (auto innerCurrent = ++current; innerCurrent != select.end(); innerCurrent++) { + if (hasUnion(productionSelectSet, innerCurrent->second)) { + return false; + } + } + } + } + + return true; +} + +void Grammar::buildAnalysisTable() { + if (!isLL1Grammar()) { + throw NotSupportedGrammarException("文法定义不符合LL1文法"); + } + + for (const auto &nonTerminal: nonTerminals) { + SelectSet select = selectSet[nonTerminal]; + for (const auto &selectPair: select) { + const auto &prod = selectPair.first; + const auto &selectChars = selectPair.second; + + // 填充预测分析表 + for (const auto &selectableTerminal: selectChars) { + analysisTable[nonTerminal][selectableTerminal] = prod; + } + } + } +} \ No newline at end of file diff --git a/parser/grammar/First.cpp b/parser/grammar/First.cpp new file mode 100644 index 0000000..889e3df --- /dev/null +++ b/parser/grammar/First.cpp @@ -0,0 +1,53 @@ +// +// Created by lenfrex on 2023/5/7. +// + +#include "Grammar.h" + +using namespace std; + +void Grammar::generateFirstSet() { + // 处理每一个非终止符 + for (char nonTerminal: nonTerminals) { + calcFirstSetForNonTerminal(nonTerminal); + } +} + +void Grammar::calcFirstSetForNonTerminal(char nonTerminal) { + if (!productionsMap.contains(nonTerminal)) { + return; + } + + const set &productions = productionsMap.at(nonTerminal); + // 处理非终止符的每个产生式 + for (const auto &production: productions) { + for (char currentChar: production) { + // 如果这个字符是终止符那就直接加到终止符集合里面,并退出逐字符解析 + if (terminals.contains(currentChar)) { + firstSet[nonTerminal].insert(currentChar); + break; + } + + // 该非终止符是否已经有first集合了,有的话就全部加到里面去 + // 否则递归处理,先计算这个非终止符的first + ProcessNonTerminal: + if (firstSet.contains(currentChar)) { + firstSet[nonTerminal].insert(firstSet[currentChar].begin(), firstSet[currentChar].end()); + } else { + // 如果左递归了,就先处理其他的表达式,但是这个时候其实已经不算是LL1文法了 + if (currentChar == nonTerminal) { + break; + } + calcFirstSetForNonTerminal(currentChar); + // 递归完还要回来继续处理之前待处理的非终止符,如S->A..., A->a, 递归处理完A后还要返回继续处理S + goto ProcessNonTerminal; + } + + // 如果这个非终止符不会推出空字符,就直接退出解析 + // 能推出空字符就继续 + if (!canDeducedEmpty(productionsMap.at(currentChar))) { + break; + } + } + } +} \ No newline at end of file diff --git a/parser/grammar/Follow.cpp b/parser/grammar/Follow.cpp new file mode 100644 index 0000000..e8da2cb --- /dev/null +++ b/parser/grammar/Follow.cpp @@ -0,0 +1,187 @@ +// +// Created by lenfrex on 2023/5/7. +// + +#include "Grammar.h" + +using namespace std; + +#define isEmptyStr(c) (c == '~') + +void Grammar::generateFollowSet() { + this->followPreProcess(); + + // 不断补充follow集,直到总的follow集大小不会再变大为止 + int beforeSize = getTotalFollowSize(); + for (int afterSize = -1; afterSize != beforeSize; afterSize = getTotalFollowSize()) { + beforeSize = getTotalFollowSize(); + this->followFinalProcess(); + } +} + +int Grammar::getTotalFollowSize() { + int count = 0; + for (const auto &followPair: followSet) { + count += (int) followPair.second.size(); + } + + return count; +} + +void searchForProduction(const Grammar &grammar, + const string &production, + std::map> &followSet); + +void Grammar::followPreProcess() { + // 获取每一个非终止符,以获得所有的产生式 + for (const auto &nonTerminal: nonTerminals) { + // 开始符号先加个'#'结束符 + if (nonTerminal == startChar) { + followSet[nonTerminal].insert('#'); + } + + if (!productionsMap.contains(nonTerminal)) { + continue; + } + + const set &productions = productionsMap.at(nonTerminal); + + // 对当前非终止符的所有的产生式进行解析,搜索所有的非终止符进行求follow + for (const auto &production: productions) { + searchForProduction(*this, production, this->followSet); + } + } +} + +enum LoopStatus { + Finish, Continue +}; + +LoopStatus processNonTerminalFollow(const Grammar &grammar, + const string &production, const auto &currPos, + std::map> &followSet); + +void searchForProduction(const Grammar &grammar, + const string &production, + std::map> &followSet) { + + const auto &nonTerminals = grammar.getNonTerminals(); + + for (auto currPos = production.begin(); currPos != production.end(); currPos++) { + char currChar = *currPos; + + // 如果当前的符号不是非终止符,就继续解析下一个字符, + // 直到遇到的是非终止符 + if (!nonTerminals.contains(currChar)) { + continue; + } + + LoopStatus result = processNonTerminalFollow(grammar, production, currPos, followSet); + switch (result) { + case Continue: + continue; + case Finish: + break; + } + } +} + +LoopStatus processNonTerminalFollow(const Grammar &grammar, + const string &production, const auto &currPos, + std::map> &followSet) { + + const auto &terminals = grammar.getTerminals(); + const auto &productionsMap = grammar.getProductionsMap(); + const auto &firstSet = grammar.getFirstSet(); + + auto next = currPos + 1; + char currChar = *currPos; + + ProcessNoneTerminal: + // 到了产生式尾部就直接添加结束符 + if (next == production.end()) { + followSet[currChar].insert('#'); + return Finish; + } + + // 如果下一个字符就是终结符的话,直接加到结果里边去,然后继续处理后面可能会遇到的非终止符 + // 如果下一个字符也是非终止符,把下一个非终止符的firs去掉空串之后加到当前的follow中 + if (terminals.contains(*next)) { + followSet[currChar].insert(*next); + return Continue; + } else { + set addFirst = firstSet.at(*next); + addFirst.erase('~'); + followSet[currChar].insert(addFirst.begin(), addFirst.end()); + + // 如果下一个能推导出空串,那就还要继续找后面非终止符,知道非终止符不能推空为止 + if (Grammar::canDeducedEmpty(productionsMap.at(*next))) { + next++; + goto ProcessNoneTerminal; + } else { + return Finish; + } + } +} + +LoopStatus finalProcessNonTerminalFollow(const Grammar &grammar, char nonTerminal, + const string &production, const auto &currPos, + std::map> &followSet); + +void Grammar::followFinalProcess() { + // 处理每一个非终止符 + for (char nonTerminal: nonTerminals) { + if (!productionsMap.contains(nonTerminal)) { + continue; + } + + const set &productions = productionsMap.at(nonTerminal); + + // 对所有的产生式进行解析,搜索所有的非终止符进行求follow + for (const auto &production: productions) { + for (auto currPos = production.begin(); currPos != production.end(); currPos++) { + LoopStatus result = finalProcessNonTerminalFollow(*this, nonTerminal, production, currPos, this->followSet); + switch (result) { + case Continue: + continue; + case Finish: + break; + } + } + } + + } +} + +LoopStatus finalProcessNonTerminalFollow( + const Grammar &grammar, char nonTerminal, + const string &production, const auto &currPos, + std::map> &followSet) { + + char currChar = *currPos; + + const auto &productionsMap = grammar.getProductionsMap(); + const auto &nonTerminals = grammar.getNonTerminals(); + + // 如果当前的符号不是非终止符,就继续解析下一个字符, + // 直到遇到的是非终止符 + if (!nonTerminals.contains(*currPos)) { + return Continue; + } + + // 当前字符是当前产生式的尾部,添加产生式左部的follow进这个字符的follow中 + auto next = currPos + 1; + if (next == production.end()) { + followSet[currChar].insert(followSet[nonTerminal].begin(), followSet[nonTerminal].end()); + return Continue; + } + + // 如果当前字符不是产生式尾部但是下一个字符是非终结符并且位于尾部,而且还能推空,也把产生式左部的follow添加进这个字符的follow中 + auto nextNext = next + 1; + if (nextNext == production.end() && nonTerminals.contains(*next)) { + if (Grammar::canDeducedEmpty(productionsMap.at(*next))) { + followSet[currChar].insert(followSet[nonTerminal].begin(), followSet[nonTerminal].end()); + return Continue; + } + } +} \ No newline at end of file diff --git a/parser/grammar/Grammar.cpp b/parser/grammar/Grammar.cpp new file mode 100644 index 0000000..0fca1b4 --- /dev/null +++ b/parser/grammar/Grammar.cpp @@ -0,0 +1,234 @@ +// +// Created by lenfrex on 2023/5/7. +// + +#include +#include "Grammar.h" + +using namespace std; + +const set &Grammar::getNonTerminals() const { + return nonTerminals; +} + +const set &Grammar::getTerminals() const { + return terminals; +} + +#define isBlankSpace(c) (c == ' ') + +Grammar::Grammar(const string &nonTerminalString, const string &terminalString, + const set &productionTexts, char startChar) : startChar(startChar) { + for (char c: nonTerminalString) { + nonTerminals.insert(c); + } + + for (char c: terminalString) { + terminals.insert(c); + } + + // 解析每条产生式 + for (const string &prodText: productionTexts) { + char nonTerminal = '\0'; + for (char c: prodText) { + if (isBlankSpace(c)) { + continue; + } else { + nonTerminal = c; + break; + } + } + + string::size_type start = prodText.find("->"); + + // 输入的产生式有误,跳过该条产生式 + if (start == -1) { + break; + } + + for (string::size_type i = start + 2; i < prodText.length(); ++i) { + // 跳过空格 + if (isBlankSpace(prodText[i])) { + continue; + } + + productionsMap[nonTerminal].insert(prodText.substr(i)); + break; + } + } + + generateFirstSet(); + generateFollowSet(); + generateSelectSet(); + + this->ll1Grammar = isLL1Grammar(); + if (ll1Grammar) { + buildAnalysisTable(); + } +} + +const map> &Grammar::getGrammarExpresses() const { + return productionsMap; +} + +const char emptyChar = '~'; + +#define isEmptyStr(c) (c == '~') + +// 是否可以推导出空字符 +bool Grammar::canDeducedEmpty(const set &productions) { + for (const auto &production: productions) { + for (const auto &c: production) { + if (isEmptyStr(c)) { + return true; + } + } + } + + return false; +} + +ostream &alignPrint(ostream &os, const string &str) { + os << setw(8) << left << setfill(' ') << str; + return os; +} + +ostream &alignPrint(ostream &os, char str) { + os << setw(8) << left << setfill(' ') << str; + return os; +} + +ostream &operator<<(ostream &os, const Grammar &grammar) { + const auto &terminals = grammar.getTerminals(); + const auto &nonTerminals = grammar.getNonTerminals(); + + os << "文法符号详情:" << endl; + + os << "开始符号:" << grammar.getStartChar() << endl; + + os << "终止符:Vt = {"; + for (const auto &c: terminals) { + os << '\'' << c << '\'' << ", "; + } + os << "}" << endl; + + os << "非终止符:Vn = {"; + for (const auto &c: nonTerminals) { + os << '\'' << c << '\'' << ", "; + } + os << "}" << endl; + + os << "================================" << endl; + os << " FirstSet " << endl; + os << "================================" << endl; + + const auto &firstSet = grammar.getFirstSet(); + for (const auto &nonTerminal: nonTerminals) { + os << "First(" << nonTerminal << ") = {"; + const auto &firstChars = firstSet.at(nonTerminal); + for (const auto &firstChar: firstChars) { + os << '\'' << firstChar << '\'' << ", "; + } + os << "}" << endl; + } + + os << "================================" << endl; + os << " FollowSet " << endl; + os << "================================" << endl; + + const auto &followSet = grammar.getFollowSet(); + for (const auto &nonTerminal: nonTerminals) { + os << "Follow(" << nonTerminal << ") = {"; + const auto &followChars = followSet.at(nonTerminal); + for (const auto &followChar: followChars) { + os << '\'' << followChar << '\'' << ", "; + } + os << "}" << endl; + } + + os << "================================" << endl; + os << " SelectSet " << endl; + os << "================================" << endl; + + const auto &allSelectSet = grammar.getSelectSet(); + for (const auto &nonTerminal: nonTerminals) { + const auto &nonTerminalSelectSet = allSelectSet.at(nonTerminal); + for (const auto &selectPair: nonTerminalSelectSet) { + os << "Select(" << nonTerminal << "->" << selectPair.first << ')' << "\t=\t" << "{"; + for (const auto &selectChar: selectPair.second) { + os << '\'' << selectChar << '\'' << ", "; + } + os << "}" << endl; + } + } + + os << "================================" << endl; + os << " AnalysisTable " << endl; + os << "================================" << endl; + + alignPrint(os, "Vn/Vt"); + os << "| "; + for (const auto &terminal: terminals) { + if (terminal != '~') { + alignPrint(os, terminal); + } + } + alignPrint(os, '#'); + os << endl; + + for (int i = 0; i < terminals.size(); ++i) { + os << "----------"; + } + os << endl; + + const auto &analysisTable = grammar.getAnalysisTable(); + for (const auto &nonTerminal: nonTerminals) { + alignPrint(os, nonTerminal); + os << "| "; + + for (const auto &terminal: terminals) { + if (terminal == '~') { + break; + } + bool found = analysisTable.contains(nonTerminal) && analysisTable.at(nonTerminal).contains(terminal); + string printText = found ? ("->" + analysisTable.at(nonTerminal).at(terminal)): "[/]"; + alignPrint(os, printText); + } + + bool found = analysisTable.contains(nonTerminal) && analysisTable.at(nonTerminal).contains('#'); + string printText = found ? ("->" + analysisTable.at(nonTerminal).at('#')): "[/]"; + alignPrint(os, printText); + + os << endl; + } + + return os; +} + +char Grammar::getStartChar() const { + return startChar; +} + +const map> &Grammar::getProductionsMap() const { + return productionsMap; +} + +const map> &Grammar::getFirstSet() const { + return firstSet; +} + +const map> &Grammar::getFollowSet() const { + return followSet; +} + +const map &Grammar::getSelectSet() const { + return selectSet; +} + +const map &Grammar::getAnalysisTable() const { + return analysisTable; +} + +bool Grammar::isLl1Grammar() const { + return ll1Grammar; +} \ No newline at end of file diff --git a/parser/grammar/Grammar.h b/parser/grammar/Grammar.h new file mode 100644 index 0000000..15ce0ce --- /dev/null +++ b/parser/grammar/Grammar.h @@ -0,0 +1,119 @@ +// +// Created by lenfrex on 2023/5/7. +// + +#ifndef LEXER_CPP_GRAMMAR_H +#define LEXER_CPP_GRAMMAR_H + +#include +#include "vector" +#include "string" +#include "set" +#include "map" + +typedef std::string Express; + +// 单个产生式的select集合 +typedef std::pair> ExpressSelectSet; + +// 多个产生式的Select集合set,同一个非终止符的所有推导式的select集合 +typedef std::set SelectSet; + +typedef std::map TableRow; + +// 文法定义 +class Grammar { +private: + + char startChar; + + // 非终止符,vn + std::set nonTerminals = std::set();; + + // 终止符,vt + std::set terminals = std::set();; + + // 文法产生式 + std::map> productionsMap; + + std::map> firstSet = std::map>(); + std::map> followSet = std::map>(); + std::map selectSet = std::map(); + + std::map analysisTable = std::map>(); + + bool ll1Grammar = false; + + void calcFirstSetForNonTerminal(char nonTerminal); + + // 计算first集合 + void generateFirstSet(); + + // 获取当前总的follow集合元素数量 + int getTotalFollowSize(); + + // 计算follow集合 + void generateFollowSet(); + + // follow集合预处理 + void followPreProcess(); + + // follow集合完善处理 + void followFinalProcess(); + + // 计算select集合 + void generateSelectSet(); + + // 检查当前是否为LL1文法 + bool isLL1Grammar(); + + // 构建预测分析表 + void buildAnalysisTable(); + +public: + [[nodiscard]] const std::set &getNonTerminals() const; + + [[nodiscard]] const std::set &getTerminals() const; + + [[nodiscard]] const std::map> &getGrammarExpresses() const; + + Grammar(const std::string &nonTerminalString, const std::string &terminalString, + const std::set &productionTexts, char startChar); + + [[nodiscard]] char getStartChar() const; + + [[nodiscard]] const std::map> &getProductionsMap() const; + + [[nodiscard]] const std::map> &getFirstSet() const; + + [[nodiscard]] const std::map> &getFollowSet() const; + + [[nodiscard]] const std::map &getSelectSet() const; + + [[nodiscard]] const std::map &getAnalysisTable() const; + + [[nodiscard]] bool isLl1Grammar() const; + + friend std::ostream &operator<<(std::ostream &os, const Grammar &grammar); + + static bool canDeducedEmpty(const std::set &productions); +}; + +// 文法不支持异常 +class NotSupportedGrammarException : public std::exception { +private: + const std::string msg; +public: + [[nodiscard]] const char *what() const _GLIBCXX_TXN_SAFE_DYN _GLIBCXX_NOTHROW override { + return msg.data(); + }; + +public: + explicit NotSupportedGrammarException(std::string msg) : msg(std::move(msg)) {}; + + [[nodiscard]] const std::string &getMsg() const { + return msg; + }; +}; + +#endif //LEXER_CPP_GRAMMAR_H diff --git a/parser/grammar/Select.cpp b/parser/grammar/Select.cpp new file mode 100644 index 0000000..cfdbad6 --- /dev/null +++ b/parser/grammar/Select.cpp @@ -0,0 +1,72 @@ +// +// Created by lenfrex on 2023/5/8. +// + +#include "Grammar.h" + +using namespace std; + +#define isEmptyStr(c) (c == '~') + +void Grammar::generateSelectSet() { + // 处理每一个非终止符,以取得产生式 + for (const auto &leftNonTerminal: nonTerminals) { + if (!productionsMap.contains(leftNonTerminal)) { + continue; + } + + const set &productions = productionsMap.at(leftNonTerminal); + + // 解析非终止符的每个产生式 + for (const auto &production: productions) { + ExpressSelectSet productionSelectSet = ExpressSelectSet(); + productionSelectSet.first = production; + productionSelectSet.second = set(); + + // 解析产生式,一个一个字符处理 + for (auto currPos = production.begin(); currPos != production.end(); currPos++) { + char currChar = *currPos; + + // 如果当前字符是终止符 + // - 是空串,把产生式左部非终止符(即A->a...中的A, 此处即为leftNonTerminal)的follow加到当前产生式的select中(productionSelectSet); + // - 不是空串,就直接把这个符号加入到当前产生式的select + // + // 如果是非终结符把,则把当前字符的first删去空串之后加入当前产生式的select中 + // - 当前非终止符可以推出空串,就继续往后解析处理当前非终止符为空时的情况; + // - 如果是最后一个字符,把产生式左部follow加当前产生式的select中; + // - 如果不是最后一个字符,则继续看下一个字符(continue) + // - 不能推出空串,说明该条产生式select集合的计算结束(break) + if (terminals.contains(currChar)) { + if (isEmptyStr(currChar)) { + productionSelectSet.second.insert(followSet[leftNonTerminal].begin(), followSet[leftNonTerminal].end()); + } else { + productionSelectSet.second.insert(currChar); + } + break; + } else { + // 前字符的first删去空串之后加入到产生式的select中 + set currFirst = firstSet[currChar]; + currFirst.erase('~'); + productionSelectSet.second.insert(currFirst.begin(), currFirst.end()); + + // 如果当前字符能推出空串,就看看下一个字符怎么样 + // 不能推出空串就break退出 + if (canDeducedEmpty(productionsMap.at(currChar))) { + // 如果现在的这个字符已经是产生式末尾了,就把产生式左部follow加进去 + // 否则转到下一个字符 + auto next = currPos + 1; + if (next == production.end()) { + productionSelectSet.second.insert(followSet[leftNonTerminal].begin(), followSet[leftNonTerminal].end()); + } else { + continue; + } + } else { + break; + } + } + } + + selectSet[leftNonTerminal].insert(productionSelectSet); + } + } +} \ No newline at end of file diff --git a/parser/test.cpp b/parser/test.cpp new file mode 100644 index 0000000..1e3b5fc --- /dev/null +++ b/parser/test.cpp @@ -0,0 +1,80 @@ +// +// Created by lenfrex on 2023/5/7. +// + +#include +#include +#include "iostream" +#include "grammar/Grammar.h" +#include "Checker.h" + +using namespace std; + +int main() { +#ifdef __WIN32__ + system("chcp 65001"); + system("cls"); +#endif + + fstream file; + + file.open("../grammar.txt", ios::in); + + if (!file.is_open()) { + cout << "打开文件时发生错误"; + return -1; + } + + // 读非终止符行 + string nonTerminals; + getline(file, nonTerminals); + + // 读终止符行 + string terminals; + getline(file, terminals); + + cout << "读入非终止符:\t" << nonTerminals << endl; + cout << "读入终止符:\t" << terminals << endl; + + string str; + set grammarExpresses = set(); + + // 跳过中间用来分隔的一行 + getline(file, str); + while (getline(file, str)) { + grammarExpresses.insert(str); + cout << "读入文法产生式:\t" << str << endl; + } + + cout << "--------------------------------" << endl; + + char startChar = nonTerminals.at(0); + + Grammar grammar = Grammar(nonTerminals, terminals, grammarExpresses, startChar); + cout << grammar << endl; + + cout << "--------------------------------" << endl; + + try { + Checker checker = Checker(grammar); + string input; + cout << "输入待分析串:"; + cin >> input; + cout << "分析:" << input << endl; + + cout << "----------------------------------------------------------------" << endl; + + bool accepted = checker.identifyString(input); + if (!accepted) { + cout << "输入串未接受" << endl; + } else { + cout << "输入串接受" << endl; + } + } catch (NotSupportedGrammarException &e) { + cout << e.getMsg() << endl; + cout << "请检查文法定义是否符合LL1文法规则。" << endl; + return -1; + } + + return 0; +} \ No newline at end of file diff --git a/testCode.txt b/testCode.txt new file mode 100644 index 0000000..e69de29