本科生:调整课程解析算法;调整了课程对象的比较条件

old-package
lensfrex 2 years ago
parent ac79a9142e
commit 2102a7ac18
Signed by: lensfrex
GPG Key ID: 0F69A0A2FBEE98A0
  1. 4
      mywust-common/src/main/java/cn/linghang/mywust/data/global/Course.java
  2. 98
      mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/UndergradCourseTableParser.java

@ -75,12 +75,12 @@ public class Course {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Course course = (Course) o;
return startWeek == course.startWeek && endWeek == course.endWeek && weekDay == course.weekDay && startSection == course.startSection && endSection == course.endSection && Objects.equals(teachClass, course.teachClass);
return startWeek == course.startWeek && endWeek == course.endWeek && weekDay == course.weekDay && startSection == course.startSection && endSection == course.endSection && name.equals(course.name) && teacher.equals(course.teacher) && teachClass.equals(course.teachClass);
}
@Override
public int hashCode() {
return Objects.hash(teachClass, startWeek, endWeek, weekDay, startSection, endSection);
return Objects.hash(name, teacher, teachClass, startWeek, endWeek, weekDay, startSection, endSection);
}
private static Map<String, Integer> makeWeekdayMap() {

@ -24,8 +24,21 @@ public class UndergradCourseTableParser implements Parser<List<Course>> {
private static final String COURSE_SPLIT_TAG_STR = "</div><div>";
private static final Pattern WEEK_REGEX = Pattern.compile("\\d+");
// 用来匹配数字的,位数不限
private static final Pattern DIGITAL_PATTERN = Pattern.compile("\\d+");
// 例:1-17(周)[03-04节]; 1-2,4-7(周)[01-02节]
// 容易看一点的:(?<week>.*?)(周)[(?<section>.*?)节]
// 提取出来后:week: 1-17, section: 03-04; week: 1-2,4-7, section: 01-02
private static final Pattern WEEK_SECTION_REGEX = Pattern.compile("(?<week>.*?)\\(周\\)\\[(?<section>.*?)节]");
/**
* 解析课程可能会有重复的课程调用者需要手动去重
*
* @param html 原页面html
* @return 解析好的课程List
* @throws ParseException 解析课表时出现任何问题
*/
@Override
public List<Course> parse(String html) throws ParseException {
try {
@ -40,12 +53,12 @@ public class UndergradCourseTableParser implements Parser<List<Course>> {
List<Course> courses = new ArrayList<>(girds.size());
// 遍历每个格子,使用girdCount计数格子来计算节次信息
// 遍历每个格子,使用girdCount计数格子来计算星期
int girdCount = 0;
for (Element gird : girds) {
girdCount++;
// 将分隔符替换成标签,方便解析
// 将分隔符替换成标签,方便重新解析格子
String girdHtml = gird.outerHtml().replace(COURSE_SPLIT_STR, COURSE_SPLIT_TAG_STR);
Elements courseElements = Jsoup.parse(girdHtml).getElementsByTag("div");
for (Element courseElement : courseElements) {
@ -62,7 +75,7 @@ public class UndergradCourseTableParser implements Parser<List<Course>> {
courseBuilder.name(courseName);
// 直接获取格子里所有课程的关键字段,每个下表对应格子里相应的课程
// 直接获取格子里的关键信息
Elements classElements = courseElement.getElementsByAttributeValue("title", "课堂名称");
Elements teacherElements = courseElement.getElementsByAttributeValue("title", "老师");
Elements timeElements = courseElement.getElementsByAttributeValue("title", "周次(节次)");
@ -78,31 +91,8 @@ public class UndergradCourseTableParser implements Parser<List<Course>> {
int weekDay = girdCount % 7;
courseBuilder.weekDay(weekDay == 0 ? 7 : weekDay);
// 靠行位置来确定节次和星期,而不是靠time字段的数据确定(因为太不好处理了)
// 对于只有一个小节的课程,这类课程多数是在线课程,这里一律按照两小节大课处理
// 具体算法就是行索引x2 + 1就是开始的节次(索引从0开始)
int lineIndex = (girdCount / 7);
courseBuilder.startSection(lineIndex * 2 + 1);
courseBuilder.endSection(lineIndex * 2 + 2);
// 切割连续周信息,如"1-2,4-6(周)"这样两段的连续周(1-3周和5-10周)
// 不直接使用String.split而是手动分割,是因为系统自带split方法每次调用都需要编译一次切割正则,效率不太行
String timeText = StringUtil.split(JsoupUtil.getElementText(timeElements, 0), ',').get(0);
List<String> times = StringUtil.split(timeText, ',');
for (String time : times) {
Matcher weekMatcher = WEEK_REGEX.matcher(time);
// 周次信息不是数字,这种情况尚未出现过,这里的if判断只是用于消除warming
if (!weekMatcher.find()) {
continue;
}
// 第二次matcher.find()匹配结束周,如果没有数字匹配说明是单周课程
int startWeek = Integer.parseInt(weekMatcher.group());
int endWeek = weekMatcher.find() ? Integer.parseInt(weekMatcher.group()) : startWeek;
courseBuilder.startWeek(startWeek).endWeek(endWeek);
courses.add(courseBuilder.build());
}
String timeText = JsoupUtil.getElementText(timeElements, 0);
this.parseTime(timeText, courseBuilder, courses);
}
}
@ -113,4 +103,52 @@ public class UndergradCourseTableParser implements Parser<List<Course>> {
throw new ParseException("解析课表时出现问题:" + e, html);
}
}
/**
* 解析周次和节次时间主要使用正则解析
*
* @param timeText 周次节次文本1-17()[03-04节]; 1-2,4-7()[01-02节]
* @param courseBuilder courseBuilder
* @param courses 课程解析结果列表
*/
private void parseTime(String timeText, Course.CourseBuilder courseBuilder, List<Course> courses) {
Matcher timeMatcher = WEEK_SECTION_REGEX.matcher(timeText);
if (timeMatcher.find()) {
// 解析节次,这种方法相比于通过定位格子来确定节次更准确,但是可能会出现重复的课程
String sectionString = timeMatcher.group("section");
Matcher sectionMatcher = DIGITAL_PATTERN.matcher(sectionString);
if (sectionMatcher.find()) {
int startSection = Integer.parseInt(sectionMatcher.group());
// 不断匹配下一个数字,直到最后一个,即为结束节次数字,如果第一次就不匹配,则为单节课
int endSection = startSection;
while (sectionMatcher.find()) {
endSection = Integer.parseInt(sectionMatcher.group());
}
courseBuilder.startSection(startSection);
courseBuilder.endSection(endSection);
}
String weekString = timeMatcher.group("week");
// 切割连续周信息,如"1-2,4-6(周)"这样两段的连续周(1-3周和5-10周)
// 不直接使用String.split而是手动分割,是因为系统自带split方法每次调用都需要编译一次切割正则,效率不太行,但是有一说一,其实可以忽略
List<String> weekTexts = StringUtil.split(weekString, ',');
for (String weekText : weekTexts) {
Matcher weekMatcher = DIGITAL_PATTERN.matcher(weekText);
// 周次信息不是数字,这种情况尚未出现过,这里的if判断只是用于消除warming
if (!weekMatcher.find()) {
continue;
}
// 第二次matcher.find()匹配结束周,如果没有数字匹配说明是单周课程
int startWeek = Integer.parseInt(weekMatcher.group());
int endWeek = weekMatcher.find() ? Integer.parseInt(weekMatcher.group()) : startWeek;
courseBuilder.startWeek(startWeek).endWeek(endWeek);
courses.add(courseBuilder.build());
}
}
}
}

Loading…
Cancel
Save