某个编译原理的实验代码存档
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
compile-work/lexer/Lexer.cpp

218 lines
6.1 KiB

2 years ago
//
// Created by lenfrex on 2023/4/13.
//
#include "Lexer.h"
#include "Token.h"
using namespace std;
// Creates a lexer that scans the text starting at `code`.
// Only the pointer is stored; the characters themselves are not copied here.
Lexer::Lexer(char* code) {
    this->buffer = code;
}
/*
 * Lexes the whole input: calls lexToken() repeatedly and appends every
 * token it produces to `results`, until the terminating '\0' is reached
 * or `stop` has been set.
 */
void Lexer::lexAll() {
    // Nothing to scan; also avoids dereferencing a null pointer below.
    if (buffer == nullptr) {
        return;
    }

    while (!stop && *buffer != '\0') {
        Token token = Token();
        Lexer::lexToken(token);

        // lexToken() sets `stop` when it reaches the terminating '\0' while
        // looking for the start of a token (e.g. trailing blanks or line
        // breaks). In that case there is no real token to record.
        if (stop) {
            break;
        }
        results.push_back(token);
    }
}
// Read-only access to the tokens collected in `results`.
const vector<Token> &Lexer::getResults() const {
    return this->results;
}
// True when x is a blank (space or horizontal tab). Line breaks are NOT
// matched. The argument is parenthesised so that expressions such as
// `c | 0` are evaluated as a whole; note that x is evaluated twice.
#define isWhitespace(x) ((x) == ' ' || (x) == '\t')
// Modelled on Lexer::LexTokenInternal from the Clang front end. Heavily
// simplified: only the general approach is borrowed, the actual flow does not
// follow Clang's implementation.
//
// Scans one token starting at `buffer`, stores its type, location and text in
// `token`, and advances `buffer` / `currPos` past it. Blanks (' ' and '\t')
// and line breaks in front of the token are skipped; every line break
// increments `currLine` and resets `currPos` to 0 ("\r\n" counts as a single
// line break).
//
// If the terminating '\0' is reached before a token starts, `stop` is set and
// the function returns immediately, leaving `token` untouched.
//
// Note: TokenType could be finer grained (e.g. distinct kinds for ">=", ">",
// ","), but for simplicity only the five broad categories are used. "<<" is
// recognised merely to show how a specific multi-character symbol is handled.
void Lexer::lexToken(Token &token) {
    for (;;) {
        // Skip blanks in front of the token.
        while (isWhitespace(*buffer)) {
            ++buffer;
            ++currPos;
        }

        const char current = *buffer;
        // One character of look-ahead; never read past the terminator.
        const char next = (current == '\0') ? '\0' : *(buffer + 1);

        // Defaults describe an unrecognised single character.
        TokenType type = TokenType::UNKNOWN;
        int length = 1;

        switch (current) {
            case '\0':
                // End of input: stop here instead of advancing past the
                // terminator.
                stop = true;
                return;

            // Line breaks: start a new line and reset the column.
            case '\r':
                // Consume "\r\n" as one line break so CRLF input is not
                // counted as two lines.
                buffer += (next == '\n') ? 2 : 1;
                ++currLine;
                currPos = 0;
                continue;
            case '\n':
                ++buffer;
                ++currLine;
                currPos = 0;
                continue;

            // '<' and the symbols starting with it: "<=", "<<", "<".
            case '<':
                type = TokenType::OPERATOR;
                if (next == '=' || next == '<') {
                    length = 2;
                }
                break;

            // ">=" or ">".
            case '>':
                type = TokenType::OPERATOR;
                if (next == '=') {
                    length = 2;
                }
                break;

            // ":=" is an operator; a lone ':' stays UNKNOWN.
            case ':':
                if (next == '=') {
                    type = TokenType::OPERATOR;
                    length = 2;
                }
                break;

            // Single-character operators.
            case '+': case '-': case '*':
            case '/': case '#': case '=':
                type = TokenType::OPERATOR;
                break;

            // Delimiters. The full stop '.' is treated like the others
            // (an earlier revision had a disabled `stop = true` for it).
            case '.':
            case '(': case ')': case ',':
            case ';':
                type = TokenType::DELIMITER;
                break;

            default:
                // Numbers.
                if (current >= '0' && current <= '9') {
                    return Lexer::lexNumber(token);
                }
                // Identifiers: for simplicity only ASCII letters and '_' may
                // start an identifier; UTF-8 characters are not considered.
                if ((current >= 'A' && current <= 'Z') ||
                    (current >= 'a' && current <= 'z') ||
                    current == '_') {
                    return Lexer::lexIdentifier(token);
                }
                // Anything else is reported as a one-character UNKNOWN token.
                break;
        }

        Lexer::setToken(token, type, buffer, length);
        return Lexer::move(length);
    }
}
// True when x is an ASCII decimal digit ('0'..'9'). The argument is
// parenthesised so compound expressions are evaluated as a whole; note that
// x is evaluated twice.
#define isNumber(x) ((x) >= '0' && (x) <= '9')
// True when x may appear inside an identifier: an ASCII digit, an ASCII
// letter or '_'. NOTE: despite its name this macro matches the LEGAL
// identifier characters; the name is kept because other code refers to it.
// The argument is parenthesised so compound expressions are evaluated as a
// whole; note that x is evaluated several times.
#define isIllegalIdentifier(x) (((x) >= '0' && (x) <= '9') || \
    ((x) >= 'a' && (x) <= 'z') || ((x) >= 'A' && (x) <= 'Z') || \
    ((x) == '_'))
// Lexes a run of decimal digits starting at `buffer` as a CONSTANTS token and
// advances past it. The caller has already seen the first digit, so counting
// starts at one and scanning starts at the second character.
inline void Lexer::lexNumber(Token &token) {
    int digits = 1;
    for (const char *cursor = buffer + 1; isNumber(*cursor); ++cursor) {
        ++digits;
    }

    Lexer::setToken(token, TokenType::CONSTANTS, buffer, digits);
    Lexer::move(digits);
}
// Lexes an identifier-shaped word starting at `buffer` and advances past it.
// The word is reported as KEYWORD when it equals an entry of `keywords`,
// otherwise as IDENTIFIER.
inline void Lexer::lexIdentifier(Token &token) {
    // The first character was already accepted by the caller, so start
    // counting at one and continue with the following characters.
    int length = 1;
    for (const char *cursor = buffer + 1; isIllegalIdentifier(*cursor); ++cursor) {
        ++length;
    }

    const string content = Lexer::copyAsString(buffer, length);

    // Decide whether the word is a keyword.
    TokenType type = TokenType::IDENTIFIER;
    for (const string &keyword : keywords) {
        if (content == keyword) {
            type = TokenType::KEYWORD;
            break;
        }
    }

    Lexer::setToken(token, type, content, length);
    Lexer::move(length);
}
// Returns a copy of the `length` characters starting at `sourceStart`.
// If a '\0' occurs inside that range the copy ends just before it.
// A null `sourceStart` or a non-positive `length` yields an empty string.
inline string Lexer::copyAsString(const char *sourceStart, int length) {
    if (sourceStart == nullptr || length <= 0) {
        return string();
    }

    const std::size_t count = static_cast<std::size_t>(length);

    // Locate an embedded terminator (if any) so the result stops there,
    // without going through a manually managed temporary buffer.
    const char *terminator = std::char_traits<char>::find(sourceStart, count, '\0');
    const std::size_t copied =
        (terminator != nullptr) ? static_cast<std::size_t>(terminator - sourceStart) : count;

    return string(sourceStart, copied);
}
// Fills `token` with `type`, the current line/column and the `length`
// characters starting at `source`. The text is copied out of the buffer and
// the rest is handled by the std::string overload.
void Lexer::setToken(Token &token, TokenType type, const char *source, int length) {
    const string text = copyAsString(source, length);
    setToken(token, type, text, length);
}
// Fills `token` with `type`, a location built from the current line
// (`currLine`), the current column (`currPos`) and `length`, and `content`
// as its text.
void Lexer::setToken(Token &token, TokenType type, const string &content, int length) {
    token.setType(type);
    token.setLocation(Location{
        .line = currLine,
        .start = currPos,
        .length = length
    });
    token.setContent(content);
}
// Advances the scan position by `offset` characters and moves the column
// counter `currPos` by the same amount.
void Lexer::move(int offset) {
    buffer += offset;
    currPos += offset;
}