main
lensfrex 2 years ago
commit 4e408f29f4
Signed by: lensfrex
GPG Key ID: 0F69A0A2FBEE98A0
  1. 28
      .gitignore
  2. 8
      .idea/.gitignore
  3. 5
      .idea/codeStyles/codeStyleConfig.xml
  4. 14
      .idea/deployment.xml
  5. 2
      .idea/lexer_cpp.iml
  6. 4
      .idea/misc.xml
  7. 8
      .idea/modules.xml
  8. 12
      .idea/statistic.xml
  9. 6
      CMakeLists.txt
  10. 11
      grammar.txt
  11. 217
      lexer/Lexer.cpp
  12. 45
      lexer/Lexer.h
  13. 29
      lexer/Token.cpp
  14. 75
      lexer/Token.h
  15. 50
      main.cpp
  16. 143
      parser/Checker.cpp
  17. 26
      parser/Checker.h
  18. 58
      parser/grammar/AnalysisTableBuilder.cpp
  19. 53
      parser/grammar/First.cpp
  20. 187
      parser/grammar/Follow.cpp
  21. 234
      parser/grammar/Grammar.cpp
  22. 119
      parser/grammar/Grammar.h
  23. 72
      parser/grammar/Select.cpp
  24. 80
      parser/test.cpp
  25. 0
      testCode.txt

28
.gitignore vendored

@ -0,0 +1,28 @@
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
# AWS User-specific
.idea/**/aws.xml
# Generated files
.idea/**/contentModel.xml
# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
# CMake
cmake-build-*/
out/
.vs/

8
.idea/.gitignore vendored

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

@ -0,0 +1,5 @@
<component name="ProjectCodeStyleConfiguration">
<state>
<option name="PREFERRED_PROJECT_CODE_STYLE" value="Default" />
</state>
</component>

@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PublishConfigData" remoteFilesAllowedToDisappearOnAutoupload="false">
<serverData>
<paths name="remote_linux (563cc454-127a-4c53-a847-5bf281087bb1)">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
</serverData>
</component>
</project>

@ -0,0 +1,2 @@
<?xml version="1.0" encoding="UTF-8"?>
<module classpath="CMake" type="CPP_MODULE" version="4" />

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CMakeWorkspace" PROJECT_DIR="$PROJECT_DIR$" />
</project>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/lexer_cpp.iml" filepath="$PROJECT_DIR$/.idea/lexer_cpp.iml" />
</modules>
</component>
</project>

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Statistic">
<option name="excludedDirectories">
<list>
<option value="$PROJECT_DIR$/cmake-build-debug" />
<option value="$PROJECT_DIR$/cmake-build-debug-wsl" />
<option value="$PROJECT_DIR$/.idea" />
</list>
</option>
</component>
</project>

@ -0,0 +1,6 @@
# Build script for the lexer / LL(1) grammar tool.
cmake_minimum_required(VERSION 3.21)
project(lexer_cpp)
set(CMAKE_CXX_STANDARD 23)
# Each source/header is listed exactly once (parser/Checker.h used to appear twice).
add_executable(lexer_cpp
        main.cpp
        lexer/Lexer.cpp lexer/Lexer.h
        lexer/Token.cpp lexer/Token.h
        parser/Checker.cpp parser/Checker.h
        parser/grammar/First.cpp
        parser/grammar/Grammar.cpp parser/grammar/Grammar.h
        parser/test.cpp
        parser/grammar/Follow.cpp
        parser/grammar/Select.cpp
        parser/grammar/AnalysisTableBuilder.cpp)

@ -0,0 +1,11 @@
ESFGT
()*+i~
---------(此行做分割用)---------
E->TG
G->+TG
G->~
T->FS
S->*FS
S->~
F->(E)
F->i

@ -0,0 +1,217 @@
//
// Created by lenfrex on 2023/4/13.
//
#include "Lexer.h"
#include "Token.h"
using namespace std;
// Creates a lexer that reads from the NUL-terminated buffer `code`.
// The buffer is not copied.
Lexer::Lexer(char* code) : buffer(code) {
}
/*
 * Lexes the whole buffer and appends every recognized token to `results`.
 * Stops at the NUL terminator or as soon as lexToken() raises `stop`.
 */
void Lexer::lexAll() {
    while (!stop && *buffer != '\0') {
        Token token;
        Lexer::lexToken(token);
        // `stop` is raised when lexToken() reached the end of the input while
        // skipping blanks/line breaks; in that case `token` was never filled in
        // and must not be recorded.
        if (stop) {
            break;
        }
        results.push_back(token);
    }
}
// Read-only access to the tokens collected so far by lexAll().
const vector<Token> &Lexer::getResults() const {
    return this->results;
}
#define isWhitespace(x) ((x) == ' ' || (x) == '\t')
// Loosely modeled on Lexer::LexTokenInternal from the Clang front end.
// Heavily simplified: only the general approach is borrowed, the actual flow
// is not the one Clang uses.
/*
 * Reads one token starting at `buffer` into `token` and advances the cursor.
 *
 * Leading blanks and line breaks are skipped first. When the NUL terminator is
 * reached, `stop` is set and the function returns without touching `token`
 * and without moving past the terminator.
 */
void Lexer::lexToken(Token &token) {
    // Skip blanks and line breaks in front of the next token.
    for (;;) {
        while (isWhitespace(*buffer)) {
            ++buffer;
            ++currPos;
        }
        if (*buffer != '\r' && *buffer != '\n') {
            break;
        }
        // A "\r\n" pair is one line break, not two.
        if (buffer[0] == '\r' && buffer[1] == '\n') {
            ++buffer;
        }
        ++buffer;
        ++currLine;
        currPos = 0;
    }

    const char current = *buffer;
    if (current == '\0') {
        // End of input: never step over the terminator.
        stop = true;
        return;
    }
    // Safe to peek: `current` is not the terminator, so buffer[1] is in range.
    const char next = buffer[1];

    switch (current) {
        // '<' and the operators that start with it.
        // TokenType could be more fine-grained (">=", ">", "," ...); only five
        // broad categories are used to keep things simple. "<<" is recognized
        // merely to show how a specific multi-character symbol is handled.
        case '<': {
            const int length = (next == '=' || next == '<') ? 2 : 1;
            Lexer::setToken(token, TokenType::OPERATOR, buffer, length);
            return Lexer::move(length);
        }
        case '>': {
            const int length = (next == '=') ? 2 : 1;
            Lexer::setToken(token, TokenType::OPERATOR, buffer, length);
            return Lexer::move(length);
        }
        case ':': {
            // ":=" is an operator, a lone ':' is not a known symbol.
            if (next == '=') {
                Lexer::setToken(token, TokenType::OPERATOR, buffer, 2);
                return Lexer::move(2);
            }
            Lexer::setToken(token, TokenType::UNKNOWN, buffer, 1);
            return Lexer::move(1);
        }
        // Single-character operators.
        case '+': case '-': case '*':
        case '/': case '#': case '=': {
            Lexer::setToken(token, TokenType::OPERATOR, buffer, 1);
            return Lexer::move(1);
        }
        // Delimiters ('.' is treated like any other delimiter).
        case '.': case '(': case ')':
        case ',': case ';': {
            Lexer::setToken(token, TokenType::DELIMITER, buffer, 1);
            return Lexer::move(1);
        }
        default:
            break;
    }

    // Numbers.
    if (current >= '0' && current <= '9') {
        return Lexer::lexNumber(token);
    }
    // Identifiers: only ASCII letters and '_' may start one; UTF-8 is not handled.
    if ((current >= 'A' && current <= 'Z') || (current >= 'a' && current <= 'z') || current == '_') {
        return Lexer::lexIdentifier(token);
    }
    Lexer::setToken(token, TokenType::UNKNOWN, buffer, 1);
    return Lexer::move(1);
}
// True when x is an ASCII decimal digit.
#define isNumber(x) (x >= '0' && x <= '9')
// True when x is an ASCII digit, an ASCII letter or '_'.
// NOTE(review): despite the "Illegal" in its name this matches the characters
// that lexIdentifier() ACCEPTS inside an identifier.
#define isIllegalIdentifier(x) ((x >= '0' && x <= '9') || \
(x >= 'a' && x <= 'z') || (x >= 'A' && x <= 'Z') || \
(x == '_'))
// Lexes a run of decimal digits starting at `buffer` into a CONSTANTS token.
// The caller guarantees that the first character is a digit.
inline void Lexer::lexNumber(Token &token) {
    int digits = 1;
    for (const char *cursor = buffer + 1; isNumber(*cursor); ++cursor) {
        ++digits;
    }
    Lexer::setToken(token, TokenType::CONSTANTS, buffer, digits);
    Lexer::move(digits);
}
// Lexes an identifier starting at `buffer`; the result is tagged KEYWORD when
// the text equals one of the entries of `keywords`, IDENTIFIER otherwise.
inline void Lexer::lexIdentifier(Token &token) {
    // The first character was already validated by the caller, start at the second.
    int length = 1;
    for (const char *cursor = buffer + 1; isIllegalIdentifier(*cursor); ++cursor) {
        ++length;
    }
    const string content = Lexer::copyAsString(buffer, length);

    // Decide the category first, then emit the token once.
    TokenType type = TokenType::IDENTIFIER;
    for (const string &keyword : keywords) {
        if (keyword == content) {
            type = TokenType::KEYWORD;
            break;
        }
    }
    Lexer::setToken(token, type, content, length);
    Lexer::move(length);
}
// Copies up to `length` characters starting at `sourceStart` into a string.
// Copying ends early at a NUL terminator, so the source is never read past its
// end. A null pointer or a non-positive length yields an empty string.
inline string Lexer::copyAsString(const char *sourceStart, int length) {
    string result;
    if (sourceStart == nullptr || length <= 0) {
        return result;
    }
    result.reserve(static_cast<string::size_type>(length));
    for (int i = 0; i < length && sourceStart[i] != '\0'; ++i) {
        result.push_back(sourceStart[i]);
    }
    return result;
}
// Fills `token` with the given type, the current line/column and the first
// `length` characters of `source`.
void Lexer::setToken(Token &token, TokenType type, const char *source, int length) {
    const Location where{currLine, currPos, length};
    token.setType(type);
    token.setLocation(where);
    token.setContent(copyAsString(source, length));
}
// Fills `token` with the given type, the current line/column and the already
// extracted text `content`; `length` is recorded as the token length.
void Lexer::setToken(Token &token, TokenType type, const string &content, int length) {
    const Location where{currLine, currPos, length};
    token.setType(type);
    token.setLocation(where);
    token.setContent(content);
}
// Advances both the read cursor and the column counter by `offset` characters.
void Lexer::move(int offset) {
    buffer += offset;
    currPos += offset;
}

@ -0,0 +1,45 @@
//
// Created by lenfrex on 2023/4/13.
//
#ifndef LEXER_CPP_LEXER_H
#define LEXER_CPP_LEXER_H
#include <string>
#include <vector>
#include "Token.h"
// Hand-written lexer over a NUL-terminated character buffer.
class Lexer {
public:
    explicit Lexer(char *code);

    // Lexes the whole buffer; the tokens are available through getResults().
    void lexAll();

    [[nodiscard]] const std::vector<Token> &getResults() const;

private:
    // Read cursor into the caller's buffer.
    char *buffer;
    // Tokens produced so far.
    std::vector<Token> results{};
    // Set once the end of the input has been reached.
    bool stop{false};
    // Line and column of the read cursor (both start at 0).
    int currLine{0};
    int currPos{0};

    void lexNumber(Token &token);

    void lexIdentifier(Token &token);

    static std::string copyAsString(const char *sourceStart, int length);

    void setToken(Token &token, TokenType type, const char *source, int length);

    void setToken(Token &token, TokenType type, const std::string &content, int length);

    void move(int offset);

    void lexToken(Token &token);
};
#endif //LEXER_CPP_LEXER_H

@ -0,0 +1,29 @@
//
// Created by lenfrex on 2023/4/13.
//
#include "Token.h"
// Position (line, start column, length) of the token in the source text.
const Location &Token::getLocation() const {
    return this->location;
}
void Token::setLocation(const Location &loc) {
Token::location = loc;
}
// Category of the token.
TokenType Token::getType() const {
    return this->type;
}
// Sets the category of the token.
void Token::setType(TokenType typ) {
    this->type = typ;
}
const std::string &Token::getContent() const {
return content;
}
void Token::setContent(const std::string &cont) {
Token::content = cont;
}

@ -0,0 +1,75 @@
//
// Created by lenfrex on 2023/4/13.
//
#ifndef LEXER_CPP_TOKEN_H
#define LEXER_CPP_TOKEN_H
#include <string>
// Number of entries in `keywords`.
#define KEYWORD_SIZE 17
// Identifiers that the lexer reports as KEYWORD instead of IDENTIFIER.
const std::string keywords[KEYWORD_SIZE] = {
        "begin",
        "end",
        "call",
        "const",
        "do",
        "length",
        "if",
        "odd",
        "procedure",
        "read",
        "then",
        "var",
        "while",
        "write",
        "Number",
        "Name",
        "HeTaiyu",
};
// Display names, indexed by the numeric value of TokenType.
const std::string TokenTypeName[] = {
        "运算符", "标识符", "常数",
        "关键字", "界符", "未知"
};
// Token categories; the numeric values index TokenTypeName.
enum TokenType {
    OPERATOR = 0,    // operator
    IDENTIFIER = 1,  // identifier
    CONSTANTS = 2,   // numeric constant
    KEYWORD = 3,     // reserved word
    DELIMITER = 4,   // delimiter
    UNKNOWN = 5      // anything else
};
// Where a token sits in the source text.
struct Location {
    int line;    // line index
    int start;   // column of the first character
    int length;  // number of characters
};
// One lexical token: its category, its text and its position.
class Token {
public:
    [[nodiscard]] const Location &getLocation() const;

    void setLocation(const Location &loc);

    [[nodiscard]] TokenType getType() const;

    void setType(TokenType typ);

    [[nodiscard]] const std::string &getContent() const;

    void setContent(const std::string &cont);

private:
    Location location{};
    TokenType type{UNKNOWN};
    std::string content{};
};
#endif //LEXER_CPP_TOKEN_H

@ -0,0 +1,50 @@
#include <string>
#include <iostream>
#include <vector>
#include <fstream>
#include <sstream>
#include "lexer/Token.h"
#include "lexer/Lexer.h"
using namespace std;
//int main() {
//#ifdef __WIN32__
// system("chcp 65001");
//#endif
// string a;
//cin >>a;
// fstream file;
// file.open("../learn2.sql", ios::in);
//
// if (!file.is_open()) {
// cout << "打开文件时发生错误";
// return -1;
// }
//
// stringstream stream;
// stream << file.rdbuf();
// string str = stream.str();
//
//// cout << str << endl;
//
// Lexer parser = Lexer(str.data());
//
// parser.lexAll();
//
// vector<Token> tokens = parser.getResults();
//
// cout << "-------------------------" << endl;
// cout << "总的Token数量: " << tokens.size() << endl;
// int count = 0;
// for (const auto &token: tokens) {
// cout << "-------------------------" << endl;
// cout << count++ << endl;
// cout << "内容: \t" << token.getContent() << endl;
// cout << "类型: \t" << TokenTypeName[token.getType()] << endl;
// cout << "长度: \t" << token.getLocation().length << endl;
// cout << "位置: \t[行:" << token.getLocation().line << ", 开始: " << token.getLocation().start << "]" << endl;
// }
//
// return 0;
//}

@ -0,0 +1,143 @@
#include <iomanip>
#include <utility>
#include "iostream"
#include "Checker.h"
using namespace std;
#define isEmptyStr(c) (c == '~')
// 能否推导出空串
bool Checker::canDeducedEmpty(const set<string> &productions) {
for (const auto &production: productions) {
for (const auto &c: production) {
if (isEmptyStr(c)) {
return true;
}
}
}
return false;
}
// Renders the stack as text, bottom element first and top element last.
// `source` is taken by value, so the caller's stack is left untouched.
string Checker::getStackContent(stack<char> source) {
    string content(source.size(), '\0');
    // Fill from the back: the top of the stack is the last character.
    for (auto slot = content.rbegin(); slot != content.rend(); ++slot) {
        *slot = source.top();
        source.pop();
    }
    return content;
}
// Prints the header row of the analysis trace to standard output.
void printTableHeader() {
    // One left-aligned, space-padded column.
    const auto column = [](int width, const char *title) {
        cout << setw(width) << left << setfill(' ') << title;
    };
    column(16, "");
    column(20, "分析栈情况");
    column(23, "待分析字符串");
    column(25, "使用产生式与匹配情况");
    cout << endl;
}
bool Checker::identifyString(std::string input) {
printTableHeader();
// 添加#到输入字符串尾部
input.push_back('#');
stack<char> analysisStack = stack<char>();
// #先入栈
analysisStack.push('#');
// 开始符号入栈, S
analysisStack.push(grammar.getStartChar());
int step = 0;
string remain = input;
string status = "分析中";
cout << setw(16) << left << setfill(' ') << step;
cout << setw(16) << left << setfill(' ') << getStackContent(analysisStack);
cout << setw(16) << left << setfill(' ') << remain;
const set<char> &terminals = grammar.getTerminals();
const auto &productionsMap = grammar.getGrammarExpresses();
const auto &analysisTable = grammar.getAnalysisTable();
for (auto pos = input.begin(); pos != input.end();) {
char currentChar = *pos;
char stackTopChar = analysisStack.top();
// 如果当前栈顶是终结符,对比当前输入字符和栈顶终结符是否匹配
// 匹配则进行规约,否则判定不匹配
if (terminals.contains(stackTopChar)) {
if (currentChar == stackTopChar) {
analysisStack.pop();
for (const auto &c: remain) {
if (c == currentChar) {
remain = remain.substr(1);
break;
}
}
status.clear();
status.push_back('\'');
status.push_back(currentChar);
status += "': 匹配";
pos++;
} else {
cout << "匹配失败。期望输入:" << stackTopChar;
cout << ",得到输入:" << currentChar << endl;
return false;
}
} else if (stackTopChar == '#') {
return currentChar == '#';
} else {
/* 对非终结符的处理 */
// 预测分析表中找不到对应的规则,但是当前的非终止符能推出空串,就退栈,该非终止符作为空串推测处理
// 否则就判定不接受
bool foundInTable = analysisTable.at(stackTopChar).contains(currentChar);
if (!foundInTable) {
if (canDeducedEmpty(productionsMap.at(stackTopChar))) {
cout << "找不到" << stackTopChar << "" << currentChar << "相匹配的规则" << endl;
return false;
} else {
analysisStack.pop();
status.clear();
status.push_back(stackTopChar);
status += "->~";
}
} else {
analysisStack.pop();
status.clear();
status.push_back(stackTopChar);
string production = analysisTable.at(stackTopChar).at(currentChar);
status += "-> " + production;
for (int i = (int) production.size() - 1; i >= 0; --i) {
char c = production[i];
if (c != '~') {
analysisStack.push(c);
}
}
}
}
step++;
cout << setw(16) << left << setfill(' ') << status << endl;
cout << setw(16) << left << setfill(' ') << step;
cout << setw(16) << left << setfill(' ') << getStackContent(analysisStack);
cout << setw(16) << left << setfill(' ') << remain;
}
cout << endl;
return true;
}
// Takes ownership of `grammar` by moving it into the checker.
Checker::Checker(Grammar grammar)
        : grammar(std::move(grammar)) {
}

@ -0,0 +1,26 @@
//
// Created by lenfrex on 2023/5/5.
//
#ifndef LEXER_CPP_CHECKER_H
#define LEXER_CPP_CHECKER_H
#include "stack"
#include "grammar/Grammar.h"
// Checks input strings against a grammar using its analysis table.
class Checker {
public:
    explicit Checker(Grammar grammar);

    // Returns true when `input` is accepted; prints the analysis trace.
    bool identifyString(std::string input);

    // True when one of `productions` contains the empty-string symbol '~'.
    static bool canDeducedEmpty(const std::set<std::string> &productions);

private:
    Grammar grammar;

    // Stack content as text, bottom element first.
    static std::string getStackContent(std::stack<char> source);
};
#endif //LEXER_CPP_CHECKER_H

@ -0,0 +1,58 @@
//
// Created by lenfrex on 2023/5/8.
//
#include "Grammar.h"
using namespace std;
// Returns true when `a` and `b` share at least one element
// (i.e. their intersection is non-empty).
bool hasUnion(const std::set<char> &a, const std::set<char> &b) {
    auto candidate = b.begin();
    while (candidate != b.end()) {
        if (a.count(*candidate) != 0) {
            return true;
        }
        ++candidate;
    }
    return false;
}
// 检查是否为LL1文法
bool Grammar::isLL1Grammar() {
// 逐个非终止符检查
for (const auto &nonTerminal: nonTerminals) {
SelectSet select = selectSet[nonTerminal];
// 取一行
for (auto current = select.begin(); current != select.end();) {
const auto &productionSelectSet = current->second;
// 取剩下的行进行比较,注意这里已经++current了,外层for就不要++了
for (auto innerCurrent = ++current; innerCurrent != select.end(); innerCurrent++) {
if (hasUnion(productionSelectSet, innerCurrent->second)) {
return false;
}
}
}
}
return true;
}
void Grammar::buildAnalysisTable() {
if (!isLL1Grammar()) {
throw NotSupportedGrammarException("文法定义不符合LL1文法");
}
for (const auto &nonTerminal: nonTerminals) {
SelectSet select = selectSet[nonTerminal];
for (const auto &selectPair: select) {
const auto &prod = selectPair.first;
const auto &selectChars = selectPair.second;
// 填充预测分析表
for (const auto &selectableTerminal: selectChars) {
analysisTable[nonTerminal][selectableTerminal] = prod;
}
}
}
}

@ -0,0 +1,53 @@
//
// Created by lenfrex on 2023/5/7.
//
#include "Grammar.h"
using namespace std;
void Grammar::generateFirstSet() {
// 处理每一个非终止符
for (char nonTerminal: nonTerminals) {
calcFirstSetForNonTerminal(nonTerminal);
}
}
void Grammar::calcFirstSetForNonTerminal(char nonTerminal) {
if (!productionsMap.contains(nonTerminal)) {
return;
}
const set<string> &productions = productionsMap.at(nonTerminal);
// 处理非终止符的每个产生式
for (const auto &production: productions) {
for (char currentChar: production) {
// 如果这个字符是终止符那就直接加到终止符集合里面,并退出逐字符解析
if (terminals.contains(currentChar)) {
firstSet[nonTerminal].insert(currentChar);
break;
}
// 该非终止符是否已经有first集合了,有的话就全部加到里面去
// 否则递归处理,先计算这个非终止符的first
ProcessNonTerminal:
if (firstSet.contains(currentChar)) {
firstSet[nonTerminal].insert(firstSet[currentChar].begin(), firstSet[currentChar].end());
} else {
// 如果左递归了,就先处理其他的表达式,但是这个时候其实已经不算是LL1文法了
if (currentChar == nonTerminal) {
break;
}
calcFirstSetForNonTerminal(currentChar);
// 递归完还要回来继续处理之前待处理的非终止符,如S->A..., A->a, 递归处理完A后还要返回继续处理S
goto ProcessNonTerminal;
}
// 如果这个非终止符不会推出空字符,就直接退出解析
// 能推出空字符就继续
if (!canDeducedEmpty(productionsMap.at(currentChar))) {
break;
}
}
}
}

@ -0,0 +1,187 @@
//
// Created by lenfrex on 2023/5/7.
//
#include "Grammar.h"
using namespace std;
#define isEmptyStr(c) (c == '~')
void Grammar::generateFollowSet() {
this->followPreProcess();
// 不断补充follow集,直到总的follow集大小不会再变大为止
int beforeSize = getTotalFollowSize();
for (int afterSize = -1; afterSize != beforeSize; afterSize = getTotalFollowSize()) {
beforeSize = getTotalFollowSize();
this->followFinalProcess();
}
}
// Total number of elements over all FOLLOW sets.
int Grammar::getTotalFollowSize() {
    int total = 0;
    for (auto entry = followSet.begin(); entry != followSet.end(); ++entry) {
        total += static_cast<int>(entry->second.size());
    }
    return total;
}
// Forward declaration: scans one production and updates `followSet` for each
// non-terminal found in it (defined further down in this file).
void searchForProduction(const Grammar &grammar,
const string &production,
std::map<char, std::set<char>> &followSet);
// First FOLLOW pass: seeds FOLLOW(start symbol) with '#' and runs
// searchForProduction() over every production of every non-terminal.
void Grammar::followPreProcess() {
    for (const char nonTerminal : nonTerminals) {
        // The start symbol is always followed by the end marker.
        if (nonTerminal == startChar) {
            followSet[nonTerminal].insert('#');
        }

        const auto owned = productionsMap.find(nonTerminal);
        if (owned == productionsMap.end()) {
            continue;
        }
        for (auto production = owned->second.begin(); production != owned->second.end(); ++production) {
            searchForProduction(*this, *production, this->followSet);
        }
    }
}
// Status reported by the per-symbol FOLLOW helpers to the loop that calls them.
enum LoopStatus {
Finish, Continue
};
// Forward declaration: handles the non-terminal at `currPos` of `production`
// during the first FOLLOW pass (defined further down in this file).
LoopStatus processNonTerminalFollow(const Grammar &grammar,
const string &production, const auto &currPos,
std::map<char, std::set<char>> &followSet);
void searchForProduction(const Grammar &grammar,
const string &production,
std::map<char, std::set<char>> &followSet) {
const auto &nonTerminals = grammar.getNonTerminals();
for (auto currPos = production.begin(); currPos != production.end(); currPos++) {
char currChar = *currPos;
// 如果当前的符号不是非终止符,就继续解析下一个字符,
// 直到遇到的是非终止符
if (!nonTerminals.contains(currChar)) {
continue;
}
LoopStatus result = processNonTerminalFollow(grammar, production, currPos, followSet);
switch (result) {
case Continue:
continue;
case Finish:
break;
}
}
}
// First-pass FOLLOW handling for the non-terminal at `currPos`:
//  - end of the production reached: '#' is added and Finish is returned;
//  - next symbol is a terminal: it is added and Continue is returned;
//  - next symbol is a non-terminal: its FIRST set without '~' is added; when
//    that symbol has an empty production the symbol after it is examined the
//    same way, otherwise Finish is returned.
// NOTE(review): adding '#' at the end of a production looks like an
// over-approximation of FOLLOW — confirm this is intended.
LoopStatus processNonTerminalFollow(const Grammar &grammar,
                                    const string &production, const auto &currPos,
                                    std::map<char, std::set<char>> &followSet) {
    const auto &terminals = grammar.getTerminals();
    const auto &productionsMap = grammar.getProductionsMap();
    const auto &firstSet = grammar.getFirstSet();

    const char target = *currPos;
    auto lookahead = currPos + 1;
    while (true) {
        if (lookahead == production.end()) {
            followSet[target].insert('#');
            return Finish;
        }
        if (terminals.contains(*lookahead)) {
            followSet[target].insert(*lookahead);
            return Continue;
        }

        set<char> borrowed = firstSet.at(*lookahead);
        borrowed.erase('~');
        followSet[target].insert(borrowed.begin(), borrowed.end());

        if (!Grammar::canDeducedEmpty(productionsMap.at(*lookahead))) {
            return Finish;
        }
        ++lookahead;
    }
}
// Forward declaration: propagation step used by Grammar::followFinalProcess()
// for the symbol at `currPos` of a production of `nonTerminal` (defined
// further down in this file).
LoopStatus finalProcessNonTerminalFollow(const Grammar &grammar, char nonTerminal,
const string &production, const auto &currPos,
std::map<char, std::set<char>> &followSet);
void Grammar::followFinalProcess() {
// 处理每一个非终止符
for (char nonTerminal: nonTerminals) {
if (!productionsMap.contains(nonTerminal)) {
continue;
}
const set<string> &productions = productionsMap.at(nonTerminal);
// 对所有的产生式进行解析,搜索所有的非终止符进行求follow
for (const auto &production: productions) {
for (auto currPos = production.begin(); currPos != production.end(); currPos++) {
LoopStatus result = finalProcessNonTerminalFollow(*this, nonTerminal, production, currPos, this->followSet);
switch (result) {
case Continue:
continue;
case Finish:
break;
}
}
}
}
}
/*
 * Propagation step of the FOLLOW computation for the symbol at `currPos` of a
 * production whose left-hand side is `nonTerminal`.
 *
 * When that symbol is a non-terminal and everything after it in the production
 * consists of non-terminals that have an empty production (this includes the
 * case that nothing comes after it), FOLLOW(nonTerminal) is added to the
 * FOLLOW set of the symbol. Always returns Continue.
 */
LoopStatus finalProcessNonTerminalFollow(
        const Grammar &grammar, char nonTerminal,
        const string &production, const auto &currPos,
        std::map<char, std::set<char>> &followSet) {
    const char currChar = *currPos;
    const auto &productionsMap = grammar.getProductionsMap();
    const auto &nonTerminals = grammar.getNonTerminals();

    // Only non-terminals have a FOLLOW set.
    if (!nonTerminals.contains(currChar)) {
        return Continue;
    }

    // Every symbol after the current one must be a non-terminal with an empty production.
    for (auto next = currPos + 1; next != production.end(); ++next) {
        if (!nonTerminals.contains(*next)) {
            return Continue;
        }
        const auto nextProductions = productionsMap.find(*next);
        if (nextProductions == productionsMap.end()
            || !Grammar::canDeducedEmpty(nextProductions->second)) {
            return Continue;
        }
    }

    // operator[] creates the entry if needed; map references stay valid
    // across the second lookup below.
    std::set<char> &target = followSet[currChar];
    // Inserting a set's own range into itself is not allowed (and pointless).
    if (currChar != nonTerminal) {
        const std::set<char> &leftFollow = followSet[nonTerminal];
        target.insert(leftFollow.begin(), leftFollow.end());
    }
    return Continue;
}

@ -0,0 +1,234 @@
//
// Created by lenfrex on 2023/5/7.
//
#include <iomanip>
#include "Grammar.h"
using namespace std;
// Non-terminal symbols (Vn) of the grammar.
const set<char> &Grammar::getNonTerminals() const {
    return this->nonTerminals;
}
// Terminal symbols (Vt) of the grammar.
const set<char> &Grammar::getTerminals() const {
    return this->terminals;
}
#define isBlankSpace(c) (c == ' ')
/*
 * Builds a grammar from its textual description.
 *
 * nonTerminalString / terminalString: every character is one symbol.
 * productionTexts: lines of the form "A->body"; blanks before the left-hand
 *   symbol, right after "->" and at the end of the line (including a trailing
 *   '\r') are ignored. Malformed lines are skipped one by one.
 * startChar: the start symbol.
 *
 * FIRST, FOLLOW and SELECT sets are computed right away; the analysis table is
 * built only when the grammar passes isLL1Grammar().
 */
Grammar::Grammar(const string &nonTerminalString, const string &terminalString,
                 const set<string> &productionTexts, char startChar) : startChar(startChar) {
    nonTerminals.insert(nonTerminalString.begin(), nonTerminalString.end());
    terminals.insert(terminalString.begin(), terminalString.end());

    // Parse every production.
    for (const string &prodText: productionTexts) {
        const string::size_type arrow = prodText.find("->");
        // Malformed production: skip this one only, keep the remaining ones.
        if (arrow == string::npos) {
            continue;
        }

        // The left-hand side is the first non-blank character and must precede "->".
        const string::size_type leftPos = prodText.find_first_not_of(' ');
        if (leftPos == string::npos || leftPos >= arrow) {
            continue;
        }
        const char nonTerminal = prodText[leftPos];

        // The body starts at the first non-blank character after "->" ...
        const string::size_type bodyStart = prodText.find_first_not_of(' ', arrow + 2);
        if (bodyStart == string::npos) {
            continue;
        }
        // ... and ends before trailing blanks / carriage return.
        const string::size_type bodyEnd = prodText.find_last_not_of(" \t\r");
        if (bodyEnd == string::npos || bodyEnd < bodyStart) {
            continue;
        }
        productionsMap[nonTerminal].insert(prodText.substr(bodyStart, bodyEnd - bodyStart + 1));
    }

    generateFirstSet();
    generateFollowSet();
    generateSelectSet();
    this->ll1Grammar = isLL1Grammar();
    if (ll1Grammar) {
        buildAnalysisTable();
    }
}
// Productions grouped by their left-hand non-terminal.
const map<char, set<Express>> &Grammar::getGrammarExpresses() const {
    return this->productionsMap;
}
const char emptyChar = '~';
#define isEmptyStr(c) (c == '~')
// Returns true when at least one of `productions` contains the empty-string
// symbol '~'.
bool Grammar::canDeducedEmpty(const set<string> &productions) {
    bool derivesEmpty = false;
    for (auto production = productions.begin();
         !derivesEmpty && production != productions.end(); ++production) {
        for (const char symbol : *production) {
            if (symbol == '~') {
                derivesEmpty = true;
                break;
            }
        }
    }
    return derivesEmpty;
}
// Writes `str` left-aligned into an 8 character wide, space-padded column.
std::ostream &alignPrint(std::ostream &os, const std::string &str) {
    os << std::setw(8);
    os << std::left;
    os << std::setfill(' ');
    return os << str;
}
// Writes the single character `str` left-aligned into an 8 character wide,
// space-padded column.
std::ostream &alignPrint(std::ostream &os, char str) {
    os << std::setw(8);
    os << std::left;
    os << std::setfill(' ');
    return os << str;
}
/*
 * Prints a report of the grammar: its symbols, the FIRST / FOLLOW / SELECT
 * sets and the analysis table. A non-terminal without an entry in one of the
 * sets is printed with an empty set instead of raising std::out_of_range.
 */
ostream &operator<<(ostream &os, const Grammar &grammar) {
    const auto &terminals = grammar.getTerminals();
    const auto &nonTerminals = grammar.getNonTerminals();

    // Prints the characters as 'a', 'b', ... (with a trailing separator).
    const auto printChars = [&os](const set<char> &chars) {
        for (const char c: chars) {
            os << '\'' << c << '\'' << ", ";
        }
    };
    // Prints the set stored for `key`, or nothing when there is no entry.
    const auto printCharsOf = [&printChars](const map<char, set<char>> &sets, char key) {
        const auto entry = sets.find(key);
        if (entry != sets.end()) {
            printChars(entry->second);
        }
    };
    const auto printBanner = [&os](const char *title) {
        os << "================================" << endl;
        os << title << endl;
        os << "================================" << endl;
    };

    os << "文法符号详情:" << endl;
    os << "开始符号:" << grammar.getStartChar() << endl;
    os << "终止符:Vt = {";
    printChars(terminals);
    os << "}" << endl;

    os << "非终止符:Vn = {";
    printChars(nonTerminals);
    os << "}" << endl;

    printBanner(" FirstSet ");
    const auto &firstSet = grammar.getFirstSet();
    for (const auto &nonTerminal: nonTerminals) {
        os << "First(" << nonTerminal << ") = {";
        printCharsOf(firstSet, nonTerminal);
        os << "}" << endl;
    }

    printBanner(" FollowSet ");
    const auto &followSet = grammar.getFollowSet();
    for (const auto &nonTerminal: nonTerminals) {
        os << "Follow(" << nonTerminal << ") = {";
        printCharsOf(followSet, nonTerminal);
        os << "}" << endl;
    }

    printBanner(" SelectSet ");
    const auto &allSelectSet = grammar.getSelectSet();
    for (const auto &nonTerminal: nonTerminals) {
        const auto entry = allSelectSet.find(nonTerminal);
        if (entry == allSelectSet.end()) {
            continue;
        }
        for (const auto &selectPair: entry->second) {
            os << "Select(" << nonTerminal << "->" << selectPair.first << ')' << "\t=\t" << "{";
            printChars(selectPair.second);
            os << "}" << endl;
        }
    }

    printBanner(" AnalysisTable ");
    alignPrint(os, "Vn/Vt");
    os << "| ";
    // The empty-string symbol has no column; '#' gets the last one.
    for (const auto &terminal: terminals) {
        if (terminal != '~') {
            alignPrint(os, terminal);
        }
    }
    alignPrint(os, '#');
    os << endl;

    for (set<char>::size_type i = 0; i < terminals.size(); ++i) {
        os << "----------";
    }
    os << endl;

    const auto &analysisTable = grammar.getAnalysisTable();
    // Text of one table cell: "->production" or "[/]" when the cell is empty.
    const auto cellText = [&analysisTable](char nonTerminal, char terminal) -> string {
        const auto row = analysisTable.find(nonTerminal);
        if (row == analysisTable.end()) {
            return "[/]";
        }
        const auto cell = row->second.find(terminal);
        if (cell == row->second.end()) {
            return "[/]";
        }
        return "->" + cell->second;
    };
    for (const auto &nonTerminal: nonTerminals) {
        alignPrint(os, nonTerminal);
        os << "| ";
        for (const auto &terminal: terminals) {
            // Skip (not stop at) '~' so that the cells stay aligned with the header.
            if (terminal == '~') {
                continue;
            }
            alignPrint(os, cellText(nonTerminal, terminal));
        }
        alignPrint(os, cellText(nonTerminal, '#'));
        os << endl;
    }

    return os;
}
char Grammar::getStartChar() const {
return startChar;
}
// Productions grouped by their left-hand non-terminal.
const map<char, std::set<Express>> &Grammar::getProductionsMap() const {
    return this->productionsMap;
}
// FIRST sets, keyed by non-terminal.
const map<char, std::set<char>> &Grammar::getFirstSet() const {
    return this->firstSet;
}
// FOLLOW sets, keyed by non-terminal.
const map<char, std::set<char>> &Grammar::getFollowSet() const {
    return this->followSet;
}
// SELECT sets of all productions, keyed by left-hand non-terminal.
const map<char, SelectSet> &Grammar::getSelectSet() const {
    return this->selectSet;
}
// Analysis table: non-terminal -> (input character -> production).
const map<char, TableRow> &Grammar::getAnalysisTable() const {
    return this->analysisTable;
}
bool Grammar::isLl1Grammar() const {
return ll1Grammar;
}

@ -0,0 +1,119 @@
//
// Created by lenfrex on 2023/5/7.
//
#ifndef LEXER_CPP_GRAMMAR_H
#define LEXER_CPP_GRAMMAR_H
#include <ostream>
#include "vector"
#include "string"
#include "set"
#include "map"
// Right-hand side of one production.
using Express = std::string;
// SELECT set of a single production.
using ExpressSelectSet = std::pair<Express, std::set<char>>;
// SELECT sets of all productions that belong to the same non-terminal.
using SelectSet = std::set<ExpressSelectSet>;
// One row of the analysis table: input character -> production to apply.
using TableRow = std::map<char, std::string>;
// Grammar definition together with its derived FIRST / FOLLOW / SELECT sets
// and analysis table.
class Grammar {
private:
    char startChar;

    // Non-terminals (Vn)
    std::set<char> nonTerminals;

    // Terminals (Vt)
    std::set<char> terminals;

    // Productions, grouped by left-hand non-terminal
    std::map<char, std::set<Express>> productionsMap;

    std::map<char, std::set<char>> firstSet;

    std::map<char, std::set<char>> followSet;

    std::map<char, SelectSet> selectSet;

    std::map<char, TableRow> analysisTable;

    bool ll1Grammar = false;

    void calcFirstSetForNonTerminal(char nonTerminal);

    // Computes the FIRST sets
    void generateFirstSet();

    // Total number of elements currently held by all FOLLOW sets
    int getTotalFollowSize();

    // Computes the FOLLOW sets
    void generateFollowSet();

    // FOLLOW computation: preprocessing pass
    void followPreProcess();

    // FOLLOW computation: completion pass
    void followFinalProcess();

    // Computes the SELECT sets
    void generateSelectSet();

    // Checks whether the grammar is LL(1)
    bool isLL1Grammar();

    // Builds the analysis table
    void buildAnalysisTable();

public:
    [[nodiscard]] const std::set<char> &getNonTerminals() const;

    [[nodiscard]] const std::set<char> &getTerminals() const;

    [[nodiscard]] const std::map<char, std::set<Express>> &getGrammarExpresses() const;

    Grammar(const std::string &nonTerminalString, const std::string &terminalString,
            const std::set<std::string> &productionTexts, char startChar);

    [[nodiscard]] char getStartChar() const;

    [[nodiscard]] const std::map<char, std::set<Express>> &getProductionsMap() const;

    [[nodiscard]] const std::map<char, std::set<char>> &getFirstSet() const;

    [[nodiscard]] const std::map<char, std::set<char>> &getFollowSet() const;

    [[nodiscard]] const std::map<char, SelectSet> &getSelectSet() const;

    [[nodiscard]] const std::map<char, TableRow> &getAnalysisTable() const;

    [[nodiscard]] bool isLl1Grammar() const;

    friend std::ostream &operator<<(std::ostream &os, const Grammar &grammar);

    static bool canDeducedEmpty(const std::set<std::string> &productions);
};
// Thrown when a grammar cannot be handled (e.g. it is not LL(1)).
class NotSupportedGrammarException : public std::exception {
private:
    const std::string msg;

public:
    explicit NotSupportedGrammarException(std::string msg) : msg(std::move(msg)) {}

    // Standard, portable signature (no standard-library-internal macros).
    // The returned pointer stays valid for the lifetime of the exception object.
    [[nodiscard]] const char *what() const noexcept override {
        return msg.c_str();
    }

    // Same text as what(), as a std::string.
    [[nodiscard]] const std::string &getMsg() const {
        return msg;
    }
};
#endif //LEXER_CPP_GRAMMAR_H

@ -0,0 +1,72 @@
//
// Created by lenfrex on 2023/5/8.
//
#include "Grammar.h"
using namespace std;
#define isEmptyStr(c) (c == '~')
void Grammar::generateSelectSet() {
// 处理每一个非终止符,以取得产生式
for (const auto &leftNonTerminal: nonTerminals) {
if (!productionsMap.contains(leftNonTerminal)) {
continue;
}
const set<string> &productions = productionsMap.at(leftNonTerminal);
// 解析非终止符的每个产生式
for (const auto &production: productions) {
ExpressSelectSet productionSelectSet = ExpressSelectSet();
productionSelectSet.first = production;
productionSelectSet.second = set<char>();
// 解析产生式,一个一个字符处理
for (auto currPos = production.begin(); currPos != production.end(); currPos++) {
char currChar = *currPos;
// 如果当前字符是终止符
// - 是空串,把产生式左部非终止符(即A->a...中的A, 此处即为leftNonTerminal)的follow加到当前产生式的select中(productionSelectSet);
// - 不是空串,就直接把这个符号加入到当前产生式的select
//
// 如果是非终结符把,则把当前字符的first删去空串之后加入当前产生式的select中
// - 当前非终止符可以推出空串,就继续往后解析处理当前非终止符为空时的情况;
// - 如果是最后一个字符,把产生式左部follow加当前产生式的select中;
// - 如果不是最后一个字符,则继续看下一个字符(continue)
// - 不能推出空串,说明该条产生式select集合的计算结束(break)
if (terminals.contains(currChar)) {
if (isEmptyStr(currChar)) {
productionSelectSet.second.insert(followSet[leftNonTerminal].begin(), followSet[leftNonTerminal].end());
} else {
productionSelectSet.second.insert(currChar);
}
break;
} else {
// 前字符的first删去空串之后加入到产生式的select中
set<char> currFirst = firstSet[currChar];
currFirst.erase('~');
productionSelectSet.second.insert(currFirst.begin(), currFirst.end());
// 如果当前字符能推出空串,就看看下一个字符怎么样
// 不能推出空串就break退出
if (canDeducedEmpty(productionsMap.at(currChar))) {
// 如果现在的这个字符已经是产生式末尾了,就把产生式左部follow加进去
// 否则转到下一个字符
auto next = currPos + 1;
if (next == production.end()) {
productionSelectSet.second.insert(followSet[leftNonTerminal].begin(), followSet[leftNonTerminal].end());
} else {
continue;
}
} else {
break;
}
}
}
selectSet[leftNonTerminal].insert(productionSelectSet);
}
}
}

@ -0,0 +1,80 @@
//
// Created by lenfrex on 2023/5/7.
//
#include <fstream>
#include <set>
#include "iostream"
#include "grammar/Grammar.h"
#include "Checker.h"
using namespace std;
int main() {
#ifdef __WIN32__
system("chcp 65001");
system("cls");
#endif
fstream file;
file.open("../grammar.txt", ios::in);
if (!file.is_open()) {
cout << "打开文件时发生错误";
return -1;
}
// 读非终止符行
string nonTerminals;
getline(file, nonTerminals);
// 读终止符行
string terminals;
getline(file, terminals);
cout << "读入非终止符:\t" << nonTerminals << endl;
cout << "读入终止符:\t" << terminals << endl;
string str;
set<string> grammarExpresses = set<string>();
// 跳过中间用来分隔的一行
getline(file, str);
while (getline(file, str)) {
grammarExpresses.insert(str);
cout << "读入文法产生式:\t" << str << endl;
}
cout << "--------------------------------" << endl;
char startChar = nonTerminals.at(0);
Grammar grammar = Grammar(nonTerminals, terminals, grammarExpresses, startChar);
cout << grammar << endl;
cout << "--------------------------------" << endl;
try {
Checker checker = Checker(grammar);
string input;
cout << "输入待分析串:";
cin >> input;
cout << "分析:" << input << endl;
cout << "----------------------------------------------------------------" << endl;
bool accepted = checker.identifyString(input);
if (!accepted) {
cout << "输入串未接受" << endl;
} else {
cout << "输入串接受" << endl;
}
} catch (NotSupportedGrammarException &e) {
cout << e.getMsg() << endl;
cout << "请检查文法定义是否符合LL1文法规则。" << endl;
return -1;
}
return 0;
}
Loading…
Cancel
Save