From 2102a7ac1845e4c068e849fcc8ad0b041a392934 Mon Sep 17 00:00:00 2001 From: lensferno Date: Fri, 24 Feb 2023 22:14:31 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9C=AC=E7=A7=91=E7=94=9F=EF=BC=9A=E8=B0=83?= =?UTF-8?q?=E6=95=B4=E8=AF=BE=E7=A8=8B=E8=A7=A3=E6=9E=90=E7=AE=97=E6=B3=95?= =?UTF-8?q?=EF=BC=9B=E8=B0=83=E6=95=B4=E4=BA=86=E8=AF=BE=E7=A8=8B=E5=AF=B9?= =?UTF-8?q?=E8=B1=A1=E7=9A=84=E6=AF=94=E8=BE=83=E6=9D=A1=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../linghang/mywust/data/global/Course.java | 4 +- .../UndergradCourseTableParser.java | 98 +++++++++++++------ 2 files changed, 70 insertions(+), 32 deletions(-) diff --git a/mywust-common/src/main/java/cn/linghang/mywust/data/global/Course.java b/mywust-common/src/main/java/cn/linghang/mywust/data/global/Course.java index d52ca44..067861a 100644 --- a/mywust-common/src/main/java/cn/linghang/mywust/data/global/Course.java +++ b/mywust-common/src/main/java/cn/linghang/mywust/data/global/Course.java @@ -75,12 +75,12 @@ public class Course { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; Course course = (Course) o; - return startWeek == course.startWeek && endWeek == course.endWeek && weekDay == course.weekDay && startSection == course.startSection && endSection == course.endSection && Objects.equals(teachClass, course.teachClass); + return startWeek == course.startWeek && endWeek == course.endWeek && weekDay == course.weekDay && startSection == course.startSection && endSection == course.endSection && name.equals(course.name) && teacher.equals(course.teacher) && teachClass.equals(course.teachClass); } @Override public int hashCode() { - return Objects.hash(teachClass, startWeek, endWeek, weekDay, startSection, endSection); + return Objects.hash(name, teacher, teachClass, startWeek, endWeek, weekDay, startSection, endSection); } private static Map makeWeekdayMap() { diff --git 
a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/UndergradCourseTableParser.java b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/UndergradCourseTableParser.java index 5425d3c..bd61afc 100644 --- a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/UndergradCourseTableParser.java +++ b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/UndergradCourseTableParser.java @@ -24,8 +24,21 @@ public class UndergradCourseTableParser implements Parser> { private static final String COURSE_SPLIT_TAG_STR = "
"; - private static final Pattern WEEK_REGEX = Pattern.compile("\\d+"); - + // 用来匹配数字的,位数不限 + private static final Pattern DIGITAL_PATTERN = Pattern.compile("\\d+"); + + // 例:1-17(周)[03-04节]; 1-2,4-7(周)[01-02节] + // 容易看一点的:(?<week>.*?)(周)[(?<section>.*?)节] + // 提取出来后:week: 1-17, section: 03-04; week: 1-2,4-7, section: 01-02 + private static final Pattern WEEK_SECTION_REGEX = Pattern.compile("(?<week>.*?)\\(周\\)\\[(?<section>
.*?)节]"); + + /** + * 解析课程,可能会有重复的课程,调用者需要手动去重 + * + * @param html 原页面html + * @return 解析好的课程List + * @throws ParseException 解析课表时出现任何问题 + */ @Override public List parse(String html) throws ParseException { try { @@ -40,12 +53,12 @@ public class UndergradCourseTableParser implements Parser> { List courses = new ArrayList<>(girds.size()); - // 遍历每个格子,使用girdCount计数格子来计算节次信息 + // 遍历每个格子,使用girdCount计数格子来计算星期 int girdCount = 0; for (Element gird : girds) { girdCount++; - // 将分隔符替换成标签,方便解析 + // 将分隔符替换成标签,方便重新解析格子 String girdHtml = gird.outerHtml().replace(COURSE_SPLIT_STR, COURSE_SPLIT_TAG_STR); Elements courseElements = Jsoup.parse(girdHtml).getElementsByTag("div"); for (Element courseElement : courseElements) { @@ -62,7 +75,7 @@ public class UndergradCourseTableParser implements Parser> { courseBuilder.name(courseName); - // 直接获取格子里所有课程的关键字段,每个下表对应格子里相应的课程 + // 直接获取格子里的关键信息 Elements classElements = courseElement.getElementsByAttributeValue("title", "课堂名称"); Elements teacherElements = courseElement.getElementsByAttributeValue("title", "老师"); Elements timeElements = courseElement.getElementsByAttributeValue("title", "周次(节次)"); @@ -78,31 +91,8 @@ public class UndergradCourseTableParser implements Parser> { int weekDay = girdCount % 7; courseBuilder.weekDay(weekDay == 0 ? 
7 : weekDay); - // 靠行位置来确定节次和星期,而不是靠time字段的数据确定(因为太不好处理了) - // 对于只有一个小节的课程,这类课程多数是在线课程,这里一律按照两小节大课处理 - // 具体算法就是行索引x2 + 1就是开始的节次(索引从0开始) - int lineIndex = (girdCount / 7); - courseBuilder.startSection(lineIndex * 2 + 1); - courseBuilder.endSection(lineIndex * 2 + 2); - - // 切割连续周信息,如"1-2,4-6(周)"这样两段的连续周(1-3周和5-10周) - // 不直接使用String.split而是手动分割,是因为系统自带split方法每次调用都需要编译一次切割正则,效率不太行 - String timeText = StringUtil.split(JsoupUtil.getElementText(timeElements, 0), ',').get(0); - List times = StringUtil.split(timeText, ','); - for (String time : times) { - Matcher weekMatcher = WEEK_REGEX.matcher(time); - // 周次信息不是数字,这种情况尚未出现过,这里的if判断只是用于消除warming - if (!weekMatcher.find()) { - continue; - } - - // 第二次matcher.find()匹配结束周,如果没有数字匹配说明是单周课程 - int startWeek = Integer.parseInt(weekMatcher.group()); - int endWeek = weekMatcher.find() ? Integer.parseInt(weekMatcher.group()) : startWeek; - courseBuilder.startWeek(startWeek).endWeek(endWeek); - - courses.add(courseBuilder.build()); - } + String timeText = JsoupUtil.getElementText(timeElements, 0); + this.parseTime(timeText, courseBuilder, courses); } } @@ -113,4 +103,52 @@ public class UndergradCourseTableParser implements Parser> { throw new ParseException("解析课表时出现问题:" + e, html); } } + + /** + * 解析周次和节次时间,主要使用正则解析 + * + * @param timeText 周次节次文本,如:1-17(周)[03-04节]; 1-2,4-7(周)[01-02节] + * @param courseBuilder courseBuilder + * @param courses 课程解析结果列表 + */ + private void parseTime(String timeText, Course.CourseBuilder courseBuilder, List courses) { + Matcher timeMatcher = WEEK_SECTION_REGEX.matcher(timeText); + if (timeMatcher.find()) { + // 解析节次,这种方法相比于通过定位格子来确定节次更准确,但是可能会出现重复的课程 + String sectionString = timeMatcher.group("section"); + Matcher sectionMatcher = DIGITAL_PATTERN.matcher(sectionString); + if (sectionMatcher.find()) { + int startSection = Integer.parseInt(sectionMatcher.group()); + + // 不断匹配下一个数字,直到最后一个,即为结束节次数字,如果第一次就不匹配,则为单节课 + int endSection = startSection; + while (sectionMatcher.find()) { + endSection = 
Integer.parseInt(sectionMatcher.group()); + } + + courseBuilder.startSection(startSection); + courseBuilder.endSection(endSection); + } + + String weekString = timeMatcher.group("week"); + + // 切割连续周信息,如"1-2,4-6(周)"这样两段的连续周(1-2周和4-6周) + // 不直接使用String.split而是手动分割,是因为系统自带split方法每次调用都需要编译一次切割正则,效率不太行,但是有一说一,其实可以忽略 + List<String> weekTexts = StringUtil.split(weekString, ','); + for (String weekText : weekTexts) { + Matcher weekMatcher = DIGITAL_PATTERN.matcher(weekText); + // 周次信息不是数字,这种情况尚未出现过,这里的if判断只是用于消除warning + if (!weekMatcher.find()) { + continue; + } + + // 第二次matcher.find()匹配结束周,如果没有数字匹配说明是单周课程 + int startWeek = Integer.parseInt(weekMatcher.group()); + int endWeek = weekMatcher.find() ? Integer.parseInt(weekMatcher.group()) : startWeek; + courseBuilder.startWeek(startWeek).endWeek(endWeek); + + courses.add(courseBuilder.build()); + } + } + } }