//
// Created by lenfrex on 2023/4/13.
//

#include "Lexer.h"

#include "Token.h"

using namespace std;

Lexer::Lexer(char* code) {
    buffer = code;
}

/*
 * Run lexical analysis over the entire input text.
 */
void Lexer::lexAll() {
    while (!stop && *buffer != '\0') {
        Token token = Token();
        Lexer::lexToken(token);
        results.push_back(token);
    }
}

const vector<Token> &Lexer::getResults() const {
    return results;
}

#define isWhitespace(x) ((x) == ' ' || (x) == '\t')

// Modeled after the Lexer::LexTokenInternal method in the Clang compiler frontend.
// Heavily simplified: only the general idea is borrowed; the concrete flow does not
// follow the original Clang implementation.
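// Overall shape: skip whitespace, then dispatch on the current character; every
// case fills in the token and advances the buffer before returning.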
void Lexer::lexToken(Token &token) {
LexStart:
    // Skip over spaces and tabs.
    if (isWhitespace(*buffer)) {
        do {
            ++buffer;
            ++currPos;
        } while (isWhitespace(*buffer));
    }

    switch (*buffer) {
        case '\0':
            stop = true;
            // Do not advance past the terminator; leave the token untouched.
            return;

        // Line break: reset the position bookkeeping.
        case '\r':
            // Consume the '\n' of a "\r\n" pair so CRLF counts as one line break.
            if (*(buffer + 1) == '\n') {
                buffer++;
            }
            // fall through
        case '\n':
            buffer++;
            currLine++;
            currPos = 0;
            goto LexStart;

        // Handle '<' and the operators that start with '<'.
        case '<': {
            char next = *(buffer + 1);
            if (next == '=') {
                int length = 2;
                Lexer::setToken(token, TokenType::OPERATOR, buffer, length);
                return Lexer::move(length);

            // TokenType should really be more fine-grained, with distinct kinds for
            // symbols such as ">=", ">" or ",". For simplicity only five broad
            // categories are used, which is all the assignment asks for anyway.
            // The "<<" branch is only included to show what classifying a specific
            // multi-character symbol would look like.
            } else if (next == '<') {
                int length = 2;
                Lexer::setToken(token, TokenType::OPERATOR, buffer, length);
                return Lexer::move(length);

            } else {
                int length = 1;
                Lexer::setToken(token, TokenType::OPERATOR, buffer, length);
                return Lexer::move(length);
            }
        }

        case '>': {
            char next = *(buffer + 1);
            if (next == '=') {
                int length = 2;
                Lexer::setToken(token, TokenType::OPERATOR, buffer, length);
                return Lexer::move(length);

            } else {
                int length = 1;
                Lexer::setToken(token, TokenType::OPERATOR, buffer, length);
                return Lexer::move(length);
            }
        }

        case ':': {
            char next = *(buffer + 1);
            if (next == '=') {
                int length = 2;
                Lexer::setToken(token, TokenType::OPERATOR, buffer, length);
                return Lexer::move(length);

            } else {
                int length = 1;
                Lexer::setToken(token, TokenType::UNKNOWN, buffer, length);
                return Lexer::move(length);
            }
        }

        // Single-character operators.
        case '+': case '-': case '*':
        case '/': case '#': case '=': {
            Lexer::setToken(token, TokenType::OPERATOR, buffer, 1);
            return Lexer::move(1);
        }

        // Delimiters; the full stop '.' gets special attention.
        case '.':
            // stop = true;
        case '(': case ')': case ',':
        case ';': /* '.' */ {
            Lexer::setToken(token, TokenType::DELIMITER, buffer, 1);
            return Lexer::move(1);
        }

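        // Note: the case ranges used below ('0' ... '9', 'a' ... 'z', 'A' ... 'Z')
        // are a GCC/Clang language extension, not standard C++.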
        // Digits.
        case '0' ... '9': {
            return Lexer::lexNumber(token);
        }

        // Identifiers. For simplicity only the ASCII letters 'a'-'z' and 'A'-'Z'
        // plus the underscore '_' are accepted; UTF-8 characters are not handled.
        case 'A' ... 'Z':
        case 'a' ... 'z':
        case '_': {
            return Lexer::lexIdentifier(token);
        }

        default:
            Lexer::setToken(token, TokenType::UNKNOWN, buffer, 1);
            return Lexer::move(1);
    }
}

#define isNumber(x) ((x) >= '0' && (x) <= '9')
// Characters that may appear in an identifier after the first one.
#define isIdentifierChar(x) (((x) >= '0' && (x) <= '9') || \
    ((x) >= 'a' && (x) <= 'z') || ((x) >= 'A' && (x) <= 'Z') || \
    ((x) == '_'))

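// Scans a run of decimal digits starting at the current buffer position and
// emits a CONSTANTS token.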
inline void Lexer::lexNumber(Token &token) {
    char *next = buffer + 1;
    int length = 1;

    while (isNumber(*next)) {
        next++;
        length++;
    }

    Lexer::setToken(token, TokenType::CONSTANTS, buffer, length);

    return Lexer::move(length);
}

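// Scans an identifier and classifies it as a KEYWORD or a plain IDENTIFIER.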
inline void Lexer::lexIdentifier(Token &token) {
    // The first character has already been matched by the caller, so start
    // scanning from the second one.
    char *next = buffer + 1;
    int length = 1;

    while (isIdentifierChar(*next)) {
        next++;
        length++;
    }

    string content = Lexer::copyAsString(buffer, length);

    // Check whether the identifier is actually a keyword.
    for (const string &keyword : keywords) {
        if (content == keyword) {
            Lexer::setToken(token, TokenType::KEYWORD, content, length);
            return Lexer::move(length);
        }
    }

    Lexer::setToken(token, TokenType::IDENTIFIER, content, length);

    return Lexer::move(length);
}

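// Copies 'length' characters starting at 'sourceStart' into a std::string.
// (std::string(sourceStart, length) would produce the same result.)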
inline string Lexer::copyAsString(const char *sourceStart, int length) {
    // Reserve one extra byte for the trailing '\0'.
    char *tmp = new char[length + 1]{0};
    for (int i = 0; i < length; ++i) {
        tmp[i] = *(sourceStart + i);
    }

    string result;
    result.append(tmp);

    delete[] tmp;
    return result;
}

void Lexer::setToken(Token &token, TokenType type, const char *source, int length) {
    token.setType(type);
    token.setLocation(Location{.line=currLine, .start=currPos, .length=length});
    token.setContent(copyAsString(source, length));
}

void Lexer::setToken(Token &token, TokenType type, const string &content, int length) {
    token.setType(type);
    token.setLocation(Location{.line=currLine, .start=currPos, .length=length});
    token.setContent(content);
}

void Lexer::move(int offset) {
    currPos += offset;
    buffer = buffer + offset;
}
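
// Usage sketch (hypothetical driver, not part of the original file). It assumes
// Token exposes accessors such as getType() and getContent(); adjust the names to
// whatever Token.h actually declares:
//
//     char source[] = "begin x := x + 1; end.";
//     Lexer lexer(source);
//     lexer.lexAll();
//     for (const Token &t : lexer.getResults()) {
//         // inspect t.getType() / t.getContent() here
//     }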