研究生:修正课表时间解析算法

old-package
lensfrex 2 years ago
parent 2102a7ac18
commit 5db63aa13b
Signed by: lensfrex
GPG Key ID: 0F69A0A2FBEE98A0
  1. 98
      mywust-core/src/main/java/cn/linghang/mywust/core/parser/graduate/GraduateCourseTableParser.java

@ -16,9 +16,15 @@ import java.util.regex.Pattern;
public class GraduateCourseTableParser implements Parser<List<Course>> { public class GraduateCourseTableParser implements Parser<List<Course>> {
private static final Pattern COURSE_TABLE_REGEX = Pattern.compile("课程:(?<name>.*?)<br>班级:(?<class>.*?)<br>\\((?<classRoom>.*?)\\)<br>(?<week>.*?)<br>(?<section>.*?)<br>主讲教师:(?<teacher>.*?)<br>"); private static final Pattern COURSE_TABLE_REGEX = Pattern.compile("课程:(?<name>.*?)<br>班级:(?<class>.*?)<br>\\((?<classRoom>.*?)\\)<br>(?<week>.*?) (?<weekdaySection>.*?)<br>主讲教师:(?<teacher>.*?)<br>");
private static final Pattern WEEK_REGEX = Pattern.compile("(?<startWeek>\\d+)-(?<endWeek>\\d+)周.*?星期(?<weekDay>[一二三四五六七日天])"); private static final Pattern DIGITAL_REGEX = Pattern.compile("\\d+");
private static final Pattern WEEKDAY_SECTION_REGEX = Pattern.compile("星期(?<weekDay>[一二三四五六七日天])<br>(?<sectionText>([上下晚]\\d,?)+)");
private static final Pattern SECTION_REGEX = Pattern.compile("(?<dayTime>[上下晚])(?<section>\\d)");
private static final String courseGirdXpath = "//*[@id=\"DataGrid1\"]/tbody/tr/td[@rowspan]";
@Override @Override
public List<Course> parse(String html) throws ParseException { public List<Course> parse(String html) throws ParseException {
@ -28,15 +34,15 @@ public class GraduateCourseTableParser implements Parser<List<Course>> {
throw new ParseException("解析研究生课表失败:关键元素不存在...", html); throw new ParseException("解析研究生课表失败:关键元素不存在...", html);
} }
// 初步拿到所有的课程格子 Elements courseGirds = table.selectXpath(courseGirdXpath);
Elements girds = table.getElementsByAttribute("rowspan");
String girdsHtml = girds.outerHtml();
List<Course> courses = new ArrayList<>(girds.size()); List<Course> courses = new ArrayList<>(courseGirds.size());
Course.CourseBuilder courseBuilder = Course.builder(); Course.CourseBuilder courseBuilder = Course.builder();
for (Element courseGird : courseGirds) {
String girdHtml = courseGird.html();
// 正则提取每一段课程文本 // 正则提取每一段课程文本
Matcher matcher = COURSE_TABLE_REGEX.matcher(girdsHtml); Matcher matcher = COURSE_TABLE_REGEX.matcher(girdHtml);
while (matcher.find()) { while (matcher.find()) {
String name = matcher.group("name"); String name = matcher.group("name");
courseBuilder.name(name); courseBuilder.name(name);
@ -53,8 +59,9 @@ public class GraduateCourseTableParser implements Parser<List<Course>> {
String weekStr = matcher.group("week"); String weekStr = matcher.group("week");
this.parseWeek(weekStr, courseBuilder); this.parseWeek(weekStr, courseBuilder);
String section = matcher.group("section"); this.parseWeekdaySectionAndFillCourse(matcher.group("weekdaySection"), courseBuilder, courses);
this.fillCourseList(section, courseBuilder, courses); }
} }
return courses; return courses;
@ -64,36 +71,54 @@ public class GraduateCourseTableParser implements Parser<List<Course>> {
* 解析周次信息文本 * 解析周次信息文本
* *
* @param weekText 周次文本3-14周:连续周 星期三 * @param weekText 周次文本3-14周:连续周 星期三
* @param builder Lesson的builder * @param builder Course的builder
*/ */
private void parseWeek(String weekText, Course.CourseBuilder builder) { private void parseWeek(String weekText, Course.CourseBuilder builder) {
Matcher matcher = WEEK_REGEX.matcher(weekText); Matcher matcher = DIGITAL_REGEX.matcher(weekText);
if (matcher.find()) { if (matcher.find()) {
String startWeek = matcher.group("startWeek"); String startWeek = matcher.group();
String endWeek = matcher.group("endWeek");
String weekDay = matcher.group("weekDay"); // 一直匹配搜寻到最后一个数字,即为结束周次,第一次就匹配不到就是单周课
// 实际上单周课写的也是"3-3"这样子,但是这样写兼容性比较好
String endWeek = startWeek;
while (matcher.find()) {
endWeek = matcher.group();
}
builder.startWeek(Integer.parseInt(startWeek)); builder.startWeek(Integer.parseInt(startWeek));
builder.endWeek(Integer.parseInt(endWeek)); builder.endWeek(Integer.parseInt(endWeek));
builder.weekDay(Course.getWeekDayNumber(weekDay));
} }
} }
/** /**
* 解析节次并将解析出来的完整的节次放入List中 * 解析时间并填充最后的解析结果
* *
* @param sectionText 提取出来的节次文本上1,上2,上3,上4,下1,下2 * @param timeText 时间字段 "星期五&lt;br&gt;上3,上4", "星期六&lt;br&gt;上1,上2,上3,上4"
* @param builder LessonImpl中的builder * @param builder 已经解析好其他必要数据的courseBuilder
* @param courses 存放Lesson的List * @param courses 解析结果List
*/ */
private void fillCourseList(String sectionText, Course.CourseBuilder builder, List<Course> courses) { private void parseWeekdaySectionAndFillCourse(String timeText, Course.CourseBuilder builder, List<Course> courses) {
String[] sections = sectionText.split(","); Matcher timeMatcher = WEEKDAY_SECTION_REGEX.matcher(timeText);
for (int i = 0; i < sections.length / 2; i += 2) { // 解析星期和节次,一直匹配,匹配到一次就是一次连续课
int startSection = this.getSection(sections[i]); while (timeMatcher.find()) {
int endSection = this.getSection(sections[i + 1]); // 解析星期
builder.weekDay(Course.getWeekDayNumber(timeMatcher.group("weekDay")));
// 解析节次
Matcher sectionMatcher = SECTION_REGEX.matcher(timeMatcher.group("sectionText"));
if (sectionMatcher.find()) {
int startSection = getSection(sectionMatcher);
// todo 这段可以稍微优化一下
// 一直匹配,最后的那个就是结束节次,如果第一次就不匹配的话就是单节课
int endSection = startSection;
while (sectionMatcher.find()) {
endSection = getSection(sectionMatcher);
}
builder.startSection(startSection); builder.startSection(startSection);
builder.endSection(endSection); builder.endSection(endSection);
}
courses.add(builder.build()); courses.add(builder.build());
} }
@ -102,20 +127,31 @@ public class GraduateCourseTableParser implements Parser<List<Course>> {
/** /**
* 将上1下2晚1这种相对的节次格式转换为相应的绝对节次 * 将上1下2晚1这种相对的节次格式转换为相应的绝对节次
* *
* @param time 类似于上1下2晚1这种相对的节次格式文本 * @param dateTime 下这种早上晚上的文本
* @param sectionText 123这种相对的节次数文本
* @return 相应的绝对节次 * @return 相应的绝对节次
*/ */
private int getSection(String time) { private int getSection(String dateTime, String sectionText) {
int i = time.charAt(1) - 48; int section = Integer.parseInt(sectionText);
switch (time.charAt(0)) { switch (dateTime.charAt(0)) {
case '上': case '上':
return i; return section;
case '下': case '下':
return i + 4; return section + 4;
case '晚': case '晚':
return i + 8; return section + 8;
default: default:
return 1; return 1;
} }
} }
/**
 * Computes the absolute section number from a regex matcher.
 *
 * <p>Reads the {@code dayTime} and {@code section} named groups of the
 * matcher's current match and delegates to
 * {@code getSection(String, String)}.
 *
 * @param matcher a matcher that has already matched successfully and
 *                exposes the {@code dayTime} and {@code section} groups
 * @return the corresponding absolute section number
 */
private int getSection(Matcher matcher) {
    // Pull both captured pieces out first so the delegation reads plainly.
    String dayTimeText = matcher.group("dayTime");
    String sectionNumberText = matcher.group("section");
    return getSection(dayTimeText, sectionNumberText);
}
} }

Loading…
Cancel
Save