diff --git a/mywust-core/src/main/java/cn/linghang/mywust/core/exception/ParseException.java b/mywust-core/src/main/java/cn/linghang/mywust/core/exception/ParseException.java index a671ce5..f3e0c50 100644 --- a/mywust-core/src/main/java/cn/linghang/mywust/core/exception/ParseException.java +++ b/mywust-core/src/main/java/cn/linghang/mywust/core/exception/ParseException.java @@ -1,15 +1,27 @@ package cn.linghang.mywust.core.exception; public class ParseException extends BasicException { - public ParseException() { + /** Raw input that could not be parsed; exposed through getRawData(). */ + private final String rawData; + + /** Uses the default message; the argument is the raw input, not a message. */ + public ParseException(String rawData) { super("解析数据失败"); + this.rawData = rawData; } - public ParseException(String message) { + public ParseException(String message, String rawData) { super(message); + this.rawData = rawData; } - public ParseException(String message, Throwable cause) { + public ParseException(String message, Throwable cause, String rawData) { super(message, cause); + this.rawData = rawData; + } + + /** Returns the raw input that failed to parse, as passed to the constructor. */ + public String getRawData() { + return rawData; } } diff --git a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/HuangjiahuClassroomNameParser.java b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/HuangjiahuClassroomNameParser.java index 002ffd6..63ac90d 100644 --- a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/HuangjiahuClassroomNameParser.java +++ b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/HuangjiahuClassroomNameParser.java @@ -47,7 +47,7 @@ public class HuangjiahuClassroomNameParser implements Parser { } } catch (Exception e) { log.warn("解析教室编号失败，教室：{}", classroomName); - throw new ParseException(); + throw new ParseException("解析数据失败", e, classroomName); } return classRoom; diff --git a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/physics/PhysicsCoursePageParser.java b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/physics/PhysicsCoursePageParser.java index da3d37c..23b24bc 100644 --- 
a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/physics/PhysicsCoursePageParser.java +++ b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/physics/PhysicsCoursePageParser.java @@ -34,7 +34,7 @@ public class PhysicsCoursePageParser implements Parser> { public List parse(String html) throws ParseException { Elements courseElements = Jsoup.parse(html).selectXpath(PhysicsCourseXpath.COURSE_ROWS_XPATH); if (courseElements.isEmpty()) { - throw new ParseException(); + throw new ParseException(html); } List courses = new ArrayList<>(courseElements.size()); diff --git a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/physics/PhysicsIndexPageParser.java b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/physics/PhysicsIndexPageParser.java index 76cb4a4..a6dc159 100644 --- a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/physics/PhysicsIndexPageParser.java +++ b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/physics/PhysicsIndexPageParser.java @@ -12,7 +12,7 @@ public class PhysicsIndexPageParser implements Parser { Document page = Jsoup.parse(html); Elements linkElements = page.selectXpath(PhysicsIndexXpath.PHYSICS_LINK_XPATH); if (linkElements.isEmpty()) { - throw new ParseException(); + throw new ParseException(html); } return linkElements.get(0).attr("href"); diff --git a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/CourseTableParser.java b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/CourseTableParser.java index 7b8819c..ebe7f5f 100644 --- a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/CourseTableParser.java +++ b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/CourseTableParser.java @@ -22,9 +22,9 @@ public class CourseTableParser implements Parser> { private static final String COURSE_SPLIT_TAG_STR = "
"; - private static final Pattern WEEK_RANGE_REGEX = Pattern.compile("(?<startWeek>\\d+)-(?<endWeek>\\d+)\\(周\\)"); + private static final Pattern WEEK_RANGE_REGEX = Pattern.compile("(?<startWeek>\\d+)-(?<endWeek>\\d+)"); - private static final Pattern SINGLE_WEEK_REGEX = Pattern.compile("(?<week>\\d+)\\(周\\)"); + private static final Pattern SINGLE_WEEK_REGEX = Pattern.compile("(?<week>\\d+)"); @Override public List parse(String html) throws ParseException { @@ -40,6 +40,7 @@ public class CourseTableParser implements Parser> { List courses = new ArrayList<>(girds.size()); + // Walk every grid cell; girdCount numbers the cells so the section (period) can be derived from the position int girdCount = 0; for (Element gird : girds) { girdCount++; @@ -48,61 +49,75 @@ public class CourseTableParser implements Parser> { String girdHtml = gird.outerHtml().replace(COURSE_SPLIT_STR, COURSE_SPLIT_TAG_STR); Elements courseElements = Jsoup.parse(girdHtml).getElementsByTag("div"); for (Element courseElement : courseElements) { - String courseName = courseElement.ownText(); + Course.CourseBuilder courseBuilder = Course.builder(); // 格子文本为空，说明这个格子没课，直接跳过这个格子就行了 + // Note: this check depends on the jsoup version; on older versions ownText() of an empty cell is a single space rather than "" + // Some release between 1.10 and 1.15 started trimming that whitespace, so the plain comparison below is enough + // On an older jsoup this may not work; trim manually or extend the condition if you must run there + String courseName = courseElement.ownText(); if ("".equals(courseName)) { continue; } + courseBuilder.name(courseName); + // Fetch the key fields of the courses in this cell; each list index corresponds to one course Elements classElements = courseElement.getElementsByAttributeValue("title", "课堂名称"); Elements teacherElements = courseElement.getElementsByAttributeValue("title", "老师"); Elements timeElements = courseElement.getElementsByAttributeValue("title", "周次(节次)"); Elements classroomElements = courseElement.getElementsByAttributeValue("title", "教室"); - Course course = new Course(); - - course.setName(courseName); - course.setTeachClass(classElements.isEmpty() ? "" : classElements.get(0).text()); - course.setTeacher(teacherElements.isEmpty() ? 
"" : teacherElements.get(0).text()); + courseBuilder.teachClass(classElements.isEmpty() ? "" : classElements.get(0).text()); + courseBuilder.teacher(teacherElements.isEmpty() ? "" : teacherElements.get(0).text()); ClassRoom classRoom = new ClassRoom(); classRoom.setRoom(classroomElements.isEmpty() ? "" : classroomElements.get(0).text()); - course.setClassroom(classRoom); - - // 提取周次信息 - String time = timeElements.isEmpty() ? "" : timeElements.get(0).text(); - Matcher matcher = WEEK_RANGE_REGEX.matcher(time); - if (matcher.find()) { - course.setStartWeek(Integer.parseInt(matcher.group("startWeek"))); - course.setEndWeek(Integer.parseInt(matcher.group("endWeek"))); - } else { - // 普通匹配不到的话多半就是只有一周的课程 - matcher = SINGLE_WEEK_REGEX.matcher(time); - if (matcher.find()) { - course.setStartWeek(Integer.parseInt(matcher.group("week"))); - course.setEndWeek(Integer.parseInt(matcher.group("week"))); - } - } + courseBuilder.classroom(classRoom); - // 靠行位置来确定节次，而不是靠time字段的节次数据确定（因为太不好处理了） + int weekDay = girdCount % 7; + courseBuilder.weekDay(weekDay == 0 ? 7 : weekDay); + + // Derive section and weekday from the cell position rather than from the time text (too awkward to parse) + // Single-section courses (mostly online ones) are treated as a regular two-section block // 具体算法就是行索引x2 + 1就是开始的节次（索引从0开始） - int lineIndex = (int) (girdCount * 0.142); - course.setStartSection(lineIndex * 2 + 1); - course.setEndSection(lineIndex * 2 + 2); - - int weekDay = girdCount % 7; - course.setWeekDay(weekDay == 0 ? 7 : weekDay); + // Exact 0-based row index (7 cells per row); integer division replaces the 0.142 approximation of 1/7 + int lineIndex = (girdCount - 1) / 7; + courseBuilder.startSection(lineIndex * 2 + 1); + courseBuilder.endSection(lineIndex * 2 + 2); + + // Extract the weeks; the text may contain several week ranges separated by "," + // Drop the trailing "[...]" section info first so it cannot confuse the regexes + // Slightly wasteful, but acceptable + String timeText = timeElements.isEmpty() ? 
"" : timeElements.get(0).text().split("\\[")[0]; + String[] times = timeText.split(","); + for (String time : times) { + int startWeek = 0; + int endWeek = 0; + + Matcher matcher = WEEK_RANGE_REGEX.matcher(time); + if (matcher.find()) { + startWeek = Integer.parseInt(matcher.group("startWeek")); + endWeek = Integer.parseInt(matcher.group("endWeek")); + } else { + // No range matched: most likely a course that runs for a single week + matcher = SINGLE_WEEK_REGEX.matcher(time); + if (matcher.find()) { + startWeek = Integer.parseInt(matcher.group("week")); + endWeek = Integer.parseInt(matcher.group("week")); + } + } - courses.add(course); + courseBuilder.startWeek(startWeek).endWeek(endWeek); + courses.add(courseBuilder.build()); + } } } return courses; } catch (Exception e) { log.warn("解析课表时出现问题：{}", e.getMessage(), e); - throw new ParseException(); + throw new ParseException("解析数据失败", e, html); } } diff --git a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/ExamInfoParser.java b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/ExamInfoParser.java index 0afa40e..8a3828a 100644 --- a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/ExamInfoParser.java +++ b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/ExamInfoParser.java @@ -20,34 +20,37 @@ public class ExamInfoParser implements Parser> { public List parse(String html) throws ParseException { Elements rows = Jsoup.parse(html).selectXpath(ExamInfoXpath.EXAM_INFO_ROWS_XPATH); if (rows.isEmpty()) { - throw new ParseException(); + throw new ParseException(html); } List examInfos = new ArrayList<>(rows.size()); try { for (Element row : rows) { - Elements columns = row.getElementsByTag("td"); - if (columns.size() < 14) { + // Collect every cell (td) of the current row + Elements girds = row.getElementsByTag("td"); + + // Skip rows with fewer than 6 cells: they end before the score cell and carry nothing useful; cells 6-13 are optional and default to "" + if (girds.size() < 6) { continue; } ExamInfo examInfo = new ExamInfo(); // 这段看着震撼，但其实很丑 - examInfo.setId(columns.get(0).text()); - 
examInfo.setTerm(columns.get(1).text()); - examInfo.setCourseNumber(columns.get(2).text()); - examInfo.setCourseName(columns.get(3).text()); - examInfo.setGroupName(columns.get(4).text()); - examInfo.setScore(columns.get(5).text()); - examInfo.setFlag(columns.get(6).text()); - examInfo.setCredit(columns.get(7).text()); - examInfo.setCourseHours(columns.get(8).text()); - examInfo.setGradePoint(columns.get(9).text()); - examInfo.setEvaluateMethod(columns.get(11).text()); - examInfo.setKind(columns.get(12).text()); - examInfo.setCourseKind(columns.get(13).text()); + examInfo.setId(girds.get(0).text()); + examInfo.setTerm(girds.get(1).text()); + examInfo.setCourseNumber(girds.get(2).text()); + examInfo.setCourseName(girds.get(3).text()); + examInfo.setGroupName(girds.get(4).text()); + examInfo.setScore(girds.get(5).text()); + examInfo.setFlag(girds.size() > 6 ? girds.get(6).text() : ""); + examInfo.setCredit(girds.size() > 7 ? girds.get(7).text() : ""); + examInfo.setCourseHours(girds.size() > 8 ? girds.get(8).text() : ""); + examInfo.setGradePoint(girds.size() > 9 ? girds.get(9).text() : ""); + examInfo.setEvaluateMethod(girds.size() > 11 ? girds.get(11).text() : ""); + examInfo.setKind(girds.size() > 12 ? girds.get(12).text() : ""); + examInfo.setCourseKind(girds.size() > 13 ? girds.get(13).text() : ""); examInfos.add(examInfo); } diff --git a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/StudentInfoPageParser.java b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/StudentInfoPageParser.java index 7338a86..4da6c40 100644 --- a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/StudentInfoPageParser.java +++ b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/StudentInfoPageParser.java @@ -15,7 +15,7 @@ public class StudentInfoPageParser implements Parser { Document page = Jsoup.parse(html); Element table = page.getElementById("xjkpTable"); if (table == null) { - throw new ParseException(); + throw new ParseException(html); } Elements studentElements = table.selectXpath(StudentInfoXpath.STUDENT_NUMBER); diff --git 
a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/TrainingPlanPageParser.java b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/TrainingPlanPageParser.java index 9aa2f57..25510cb 100644 --- a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/TrainingPlanPageParser.java +++ b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/TrainingPlanPageParser.java @@ -4,16 +4,19 @@ import cn.linghang.mywust.core.exception.ParseException; import cn.linghang.mywust.core.parser.Parser; import org.jsoup.Jsoup; import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; public class TrainingPlanPageParser implements Parser { @Override public String parse(String html) throws ParseException { - Element trainingPlanElement = Jsoup.parse(html).getElementById("dataList"); - if (trainingPlanElement == null) { - throw new ParseException("教学方案html解析提取失败,id为dataList的元素不存在"); + Elements trainingPlanElement = Jsoup.parse(html).selectXpath("/html/body/div/div/form[1]"); + if (trainingPlanElement.isEmpty()) { + throw new ParseException("教学方案html解析提取失败，未找到XPath /html/body/div/div/form[1] 对应的表单元素", html); } - return trainingPlanElement.outerHtml(); + // For a small number of 2019-cohort students the training-plan page is scrambled (a middle part is moved to the top), so extracting only the table with id dataList would lose information + // Until a better parsing strategy is found, the order is not corrected here and the form is returned as-is + return trainingPlanElement.get(0).outerHtml(); } } diff --git a/mywust-test/src/test/java/SchemeTest.java b/mywust-test/src/test/java/SchemeTest.java index 89b7584..90bb7f2 100644 --- a/mywust-test/src/test/java/SchemeTest.java +++ b/mywust-test/src/test/java/SchemeTest.java @@ -27,7 +27,7 @@ public class SchemeTest { RequestClientOption.Proxy proxy = new RequestClientOption.Proxy(); proxy.setPort(6060); proxy.setAddress("127.0.0.1"); - option.setProxy(proxy); + option.setProxy(null); option.setFallowUrlRedirect(false); Requester requester = new SimpleOkhttpRequester();