同步在wusthelper上的改进

- 本科生课表单周时间解析不正确的问题
- 本科生成绩获取判断不正确的问题
3 years ago · ec3f252a31
parent f44615c179
commit ec3f252a31
9 changed files with 89 additions and 60 deletions
--- a/mywust-core/src/main/java/cn/linghang/mywust/core/exception/ParseException.java
+++ b/mywust-core/src/main/java/cn/linghang/mywust/core/exception/ParseException.java
@@ -1,15 +1,24 @@
 package cn.linghang.mywust.core.exception;

 public class ParseException extends BasicException {
-    public ParseException() {
+    private final String rawData;
+
+    public ParseException(String rawData) {
        super("解析数据失败");
+        this.rawData = rawData;
    }

-    public ParseException(String message) {
+    public ParseException(String message, String rawData) {
        super(message);
+        this.rawData = rawData;
    }

-    public ParseException(String message, Throwable cause) {
+    public ParseException(String message, Throwable cause, String rawData) {
        super(message, cause);
+        this.rawData = rawData;
+    }
+
+    public String getRawData() {
+        return rawData;
    }
 }
--- a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/HuangjiahuClassroomNameParser.java
+++ b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/HuangjiahuClassroomNameParser.java
@@ -47,7 +47,7 @@ public class HuangjiahuClassroomNameParser implements Parser<ClassRoom> {
            }
        } catch (Exception e) {
            log.warn("解析教室编号失败，教室：{}", classroomName);
-            throw new ParseException();
+            throw new ParseException(classroomName);
        }

        return classRoom;
--- a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/physics/PhysicsCoursePageParser.java
+++ b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/physics/PhysicsCoursePageParser.java
@@ -34,7 +34,7 @@ public class PhysicsCoursePageParser implements Parser<List<PhysicsCourse>> {
    public List<PhysicsCourse> parse(String html) throws ParseException {
        Elements courseElements = Jsoup.parse(html).selectXpath(PhysicsCourseXpath.COURSE_ROWS_XPATH);
        if (courseElements.isEmpty()) {
-            throw new ParseException();
+            throw new ParseException(html);
        }

        List<PhysicsCourse> courses = new ArrayList<>(courseElements.size());
--- a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/physics/PhysicsIndexPageParser.java
+++ b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/physics/PhysicsIndexPageParser.java
@@ -12,7 +12,7 @@ public class PhysicsIndexPageParser implements Parser<String> {
        Document page = Jsoup.parse(html);
        Elements linkElements = page.selectXpath(PhysicsIndexXpath.PHYSICS_LINK_XPATH);
        if (linkElements.isEmpty()) {
-            throw new ParseException();
+            throw new ParseException(html);
        }

        return linkElements.get(0).attr("href");
--- a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/CourseTableParser.java
+++ b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/CourseTableParser.java
@@ -22,9 +22,9 @@ public class CourseTableParser implements Parser<List<Course>> {

    private static final String COURSE_SPLIT_TAG_STR = "</div><div>";

-    private static final Pattern WEEK_RANGE_REGEX = Pattern.compile("(?<startWeek>\\d+)-(?<endWeek>\\d+)\\(周\\)");
+    private static final Pattern WEEK_RANGE_REGEX = Pattern.compile("(?<startWeek>\\d+)-(?<endWeek>\\d+)");

-    private static final Pattern SINGLE_WEEK_REGEX = Pattern.compile("(?<week>\\d+)\\(周\\)");
+    private static final Pattern SINGLE_WEEK_REGEX = Pattern.compile("(?<week>\\d+)");

    @Override
    public List<Course> parse(String html) throws ParseException {
@@ -40,6 +40,7 @@ public class CourseTableParser implements Parser<List<Course>> {

            List<Course> courses = new ArrayList<>(girds.size());

+            // 遍历每个格子，使用girdCount计数格子来计算节次信息
            int girdCount = 0;
            for (Element gird : girds) {
                girdCount++;
@@ -48,61 +49,74 @@ public class CourseTableParser implements Parser<List<Course>> {
                String girdHtml = gird.outerHtml().replace(COURSE_SPLIT_STR, COURSE_SPLIT_TAG_STR);
                Elements courseElements = Jsoup.parse(girdHtml).getElementsByTag("div");
                for (Element courseElement : courseElements) {
-                    String courseName = courseElement.ownText();
+                    Course.CourseBuilder courseBuilder = Course.builder();

                    // 格子文本为空，说明这个格子没课，直接跳过这个格子就行了
+                    // 注意，使用这个条件判断时对jsoup版本有要求，在比较旧的版本下gird.ownText()空格子其实并不空，而是有一个空格的
+                    // 在某个版本之后（至少是1.10到1.15之间的某个版本）会自动剔除多余空格（trim()），所以直接这样判断就行了
+                    // 只不过需要注意一下jsoup的版本，太旧的话可能不会起作用，如确需在旧版本上使用请手动trim或加条件
+                    String courseName = courseElement.ownText();
                    if ("".equals(courseName)) {
                        continue;
                    }

+                    courseBuilder.name(courseName);
+
                    // 直接获取格子里所有课程的关键字段，每个下表对应格子里相应的课程
                    Elements classElements = courseElement.getElementsByAttributeValue("title", "课堂名称");
                    Elements teacherElements = courseElement.getElementsByAttributeValue("title", "老师");
                    Elements timeElements = courseElement.getElementsByAttributeValue("title", "周次(节次)");
                    Elements classroomElements = courseElement.getElementsByAttributeValue("title", "教室");

-                    Course course = new Course();
-
-                    course.setName(courseName);
-                    course.setTeachClass(classElements.isEmpty() ? "" : classElements.get(0).text());
-                    course.setTeacher(teacherElements.isEmpty() ? "" : teacherElements.get(0).text());
+                    courseBuilder.teachClass(classElements.isEmpty() ? "" : classElements.get(0).text());
+                    courseBuilder.teacher(teacherElements.isEmpty() ? "" : teacherElements.get(0).text());

                    ClassRoom classRoom = new ClassRoom();
                    classRoom.setRoom(classroomElements.isEmpty() ? "" : classroomElements.get(0).text());
-                    course.setClassroom(classRoom);
-
-                    // 提取周次信息
-                    String time = timeElements.isEmpty() ? "" : timeElements.get(0).text();
-                    Matcher matcher = WEEK_RANGE_REGEX.matcher(time);
-                    if (matcher.find()) {
-                        course.setStartWeek(Integer.parseInt(matcher.group("startWeek")));
-                        course.setEndWeek(Integer.parseInt(matcher.group("endWeek")));
-                    } else {
-                        // 普通匹配不到的话多半就是只有一周的课程
-                        matcher = SINGLE_WEEK_REGEX.matcher(time);
-                        if (matcher.find()) {
-                            course.setStartWeek(Integer.parseInt(matcher.group("week")));
-                            course.setEndWeek(Integer.parseInt(matcher.group("week")));
-                        }
-                    }
+                    courseBuilder.classroom(classRoom);

-                    // 靠行位置来确定节次，而不是靠time字段的节次数据确定（因为太不好处理了）
+                    int weekDay = girdCount % 7;
+                    courseBuilder.weekDay(weekDay == 0 ? 7 : weekDay);
+
+                    // 靠行位置来确定节次和星期，而不是靠time字段的数据确定（因为太不好处理了）
+                    // 对于只有一个小节的课程，这类课程多数是在线课程，这里一律按照两小节大课处理
                    // 具体算法就是行索引x2 + 1就是开始的节次（索引从0开始）
                    int lineIndex = (int) (girdCount * 0.142);
-                    course.setStartSection(lineIndex * 2 + 1);
-                    course.setEndSection(lineIndex * 2 + 2);
-
-                    int weekDay = girdCount % 7;
-                    course.setWeekDay(weekDay == 0 ? 7 : weekDay);
+                    courseBuilder.startSection(lineIndex * 2 + 1);
+                    courseBuilder.endSection(lineIndex * 2 + 2);
+
+                    // 提取周次信息，可能会有用","分成两段的周次信息文本
+                    // 去除后面不需要的节次信息，以免对正则提取产生影响
+                    // 这样做理论上有点浪费性能了，但还行
+                    String timeText = timeElements.isEmpty() ? "" : timeElements.get(0).text().split("\\[")[0];
+                    String[] times = timeText.split(",");
+                    for (String time : times) {
+                        int startWeek = 0;
+                        int endWeek = 0;
+
+                        Matcher matcher = WEEK_RANGE_REGEX.matcher(time);
+                        if (matcher.find()) {
+                            startWeek = Integer.parseInt(matcher.group("startWeek"));
+                            endWeek = Integer.parseInt(matcher.group("endWeek"));
+                        } else {
+                            // 普通匹配不到的话多半就是只有一周的课程
+                            matcher = SINGLE_WEEK_REGEX.matcher(time);
+                            if (matcher.find()) {
+                                startWeek = Integer.parseInt(matcher.group("week"));
+                                endWeek = Integer.parseInt(matcher.group("week"));
+                            }
+                        }

-                    courses.add(course);
+                        courseBuilder.startWeek(startWeek).endWeek(endWeek);
+                        courses.add(courseBuilder.build());
+                    }
                }
            }

            return courses;
        } catch (Exception e) {
            log.warn("解析课表时出现问题：{}", e.getMessage(), e);
-            throw new ParseException();
+            throw new ParseException(html);
        }

    }
--- a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/ExamInfoParser.java
+++ b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/ExamInfoParser.java
@@ -20,34 +20,37 @@ public class ExamInfoParser implements Parser<List<ExamInfo>> {
    public List<ExamInfo> parse(String html) throws ParseException {
        Elements rows = Jsoup.parse(html).selectXpath(ExamInfoXpath.EXAM_INFO_ROWS_XPATH);
        if (rows.isEmpty()) {
-            throw new ParseException();
+            throw new ParseException(html);
        }

        List<ExamInfo> examInfos = new ArrayList<>(rows.size());

        try {
            for (Element row : rows) {
-                Elements columns = row.getElementsByTag("td");
-                if (columns.size() < 14) {
+                // 提取出当前行的所有格子
+                Elements girds = row.getElementsByTag("td");
+
+                // 如果这行格子数少于6个，即到了“成绩”的那个格子就没了，那就没啥意义了，直接跳过，不理了
+                if (girds.size() < 6) {
                    continue;
                }

                ExamInfo examInfo = new ExamInfo();

                // 这段看着震撼，但其实很丑
-                examInfo.setId(columns.get(0).text());
-                examInfo.setTerm(columns.get(1).text());
-                examInfo.setCourseNumber(columns.get(2).text());
-                examInfo.setCourseName(columns.get(3).text());
-                examInfo.setGroupName(columns.get(4).text());
-                examInfo.setScore(columns.get(5).text());
-                examInfo.setFlag(columns.get(6).text());
-                examInfo.setCredit(columns.get(7).text());
-                examInfo.setCourseHours(columns.get(8).text());
-                examInfo.setGradePoint(columns.get(9).text());
-                examInfo.setEvaluateMethod(columns.get(11).text());
-                examInfo.setKind(columns.get(12).text());
-                examInfo.setCourseKind(columns.get(13).text());
+                examInfo.setId(girds.get(0).text());
+                examInfo.setTerm(girds.get(1).text());
+                examInfo.setCourseNumber(girds.get(2).text());
+                examInfo.setCourseName(girds.get(3).text());
+                examInfo.setGroupName(girds.get(4).text());
+                examInfo.setScore(girds.get(5).text());
+                examInfo.setFlag(girds.get(6).text());
+                examInfo.setCredit(girds.get(7).text());
+                examInfo.setCourseHours(girds.get(8).text());
+                examInfo.setGradePoint(girds.get(9).text());
+                examInfo.setEvaluateMethod(girds.get(11).text());
+                examInfo.setKind(girds.get(12).text());
+                examInfo.setCourseKind(girds.get(13).text());

                examInfos.add(examInfo);
            }
--- a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/StudentInfoPageParser.java
+++ b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/StudentInfoPageParser.java
@@ -15,7 +15,7 @@ public class StudentInfoPageParser implements Parser<StudentInfo> {
        Document page = Jsoup.parse(html);
        Element table = page.getElementById("xjkpTable");
        if (table == null) {
-            throw new ParseException();
+            throw new ParseException(html);
        }

        Elements studentElements = table.selectXpath(StudentInfoXpath.STUDENT_NUMBER);
--- a/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/TrainingPlanPageParser.java
+++ b/mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/TrainingPlanPageParser.java
@@ -4,16 +4,19 @@ import cn.linghang.mywust.core.exception.ParseException;
 import cn.linghang.mywust.core.parser.Parser;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;

 public class TrainingPlanPageParser implements Parser<String> {

    @Override
    public String parse(String html) throws ParseException {
-        Element trainingPlanElement = Jsoup.parse(html).getElementById("dataList");
-        if (trainingPlanElement == null) {
-            throw new ParseException("教学方案html解析提取失败，id为dataList的元素不存在");
+        Elements trainingPlanElement = Jsoup.parse(html).selectXpath("/html/body/div/div/form[1]");
+        if (trainingPlanElement.isEmpty()) {
+            throw new ParseException("教学方案html解析提取失败，id为dataList的元素不存在", html);
        }

-        return trainingPlanElement.outerHtml();
+        // 有极少部分19级的学生培养方案页面错乱，中间某部分会被挪到最上边，直接使用id为dataList的表格提取会导致缺失部分信息
+        // 在找到更好的解析处理方式之前，此处不对顺序进行处理，直接原样返回
+        return trainingPlanElement.get(0).outerHtml();
    }
 }
--- a/mywust-test/src/test/java/SchemeTest.java
+++ b/mywust-test/src/test/java/SchemeTest.java
@@ -27,7 +27,7 @@ public class SchemeTest {
        RequestClientOption.Proxy proxy = new RequestClientOption.Proxy();
        proxy.setPort(6060);
        proxy.setAddress("127.0.0.1");
-        option.setProxy(proxy);
+        option.setProxy(null);
        option.setFallowUrlRedirect(false);

        Requester requester = new SimpleOkhttpRequester();