同步在wusthelper上的改进

- 修复本科生课表单周时间解析不正确的问题
- 修复本科生成绩获取判断不正确的问题
old-package
lensfrex 2 years ago
parent f44615c179
commit ec3f252a31
Signed by: lensfrex
GPG Key ID: 0F69A0A2FBEE98A0
  1. 15
      mywust-core/src/main/java/cn/linghang/mywust/core/exception/ParseException.java
  2. 2
      mywust-core/src/main/java/cn/linghang/mywust/core/parser/HuangjiahuClassroomNameParser.java
  3. 2
      mywust-core/src/main/java/cn/linghang/mywust/core/parser/physics/PhysicsCoursePageParser.java
  4. 2
      mywust-core/src/main/java/cn/linghang/mywust/core/parser/physics/PhysicsIndexPageParser.java
  5. 78
      mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/CourseTableParser.java
  6. 35
      mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/ExamInfoParser.java
  7. 2
      mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/StudentInfoPageParser.java
  8. 11
      mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/TrainingPlanPageParser.java
  9. 2
      mywust-test/src/test/java/SchemeTest.java

@ -1,15 +1,24 @@
package cn.linghang.mywust.core.exception; package cn.linghang.mywust.core.exception;
public class ParseException extends BasicException { public class ParseException extends BasicException {
public ParseException() { private final String rawData;
public ParseException(String rawData) {
super("解析数据失败"); super("解析数据失败");
this.rawData = rawData;
} }
public ParseException(String message) { public ParseException(String message, String rawData) {
super(message); super(message);
this.rawData = rawData;
} }
public ParseException(String message, Throwable cause) { public ParseException(String message, Throwable cause, String rawData) {
super(message, cause); super(message, cause);
this.rawData = rawData;
}
public String getRawData() {
return rawData;
} }
} }

@ -47,7 +47,7 @@ public class HuangjiahuClassroomNameParser implements Parser<ClassRoom> {
} }
} catch (Exception e) { } catch (Exception e) {
log.warn("解析教室编号失败,教室:{}", classroomName); log.warn("解析教室编号失败,教室:{}", classroomName);
throw new ParseException(); throw new ParseException(classroomName);
} }
return classRoom; return classRoom;

@ -34,7 +34,7 @@ public class PhysicsCoursePageParser implements Parser<List<PhysicsCourse>> {
public List<PhysicsCourse> parse(String html) throws ParseException { public List<PhysicsCourse> parse(String html) throws ParseException {
Elements courseElements = Jsoup.parse(html).selectXpath(PhysicsCourseXpath.COURSE_ROWS_XPATH); Elements courseElements = Jsoup.parse(html).selectXpath(PhysicsCourseXpath.COURSE_ROWS_XPATH);
if (courseElements.isEmpty()) { if (courseElements.isEmpty()) {
throw new ParseException(); throw new ParseException(html);
} }
List<PhysicsCourse> courses = new ArrayList<>(courseElements.size()); List<PhysicsCourse> courses = new ArrayList<>(courseElements.size());

@ -12,7 +12,7 @@ public class PhysicsIndexPageParser implements Parser<String> {
Document page = Jsoup.parse(html); Document page = Jsoup.parse(html);
Elements linkElements = page.selectXpath(PhysicsIndexXpath.PHYSICS_LINK_XPATH); Elements linkElements = page.selectXpath(PhysicsIndexXpath.PHYSICS_LINK_XPATH);
if (linkElements.isEmpty()) { if (linkElements.isEmpty()) {
throw new ParseException(); throw new ParseException(html);
} }
return linkElements.get(0).attr("href"); return linkElements.get(0).attr("href");

@ -22,9 +22,9 @@ public class CourseTableParser implements Parser<List<Course>> {
private static final String COURSE_SPLIT_TAG_STR = "</div><div>"; private static final String COURSE_SPLIT_TAG_STR = "</div><div>";
private static final Pattern WEEK_RANGE_REGEX = Pattern.compile("(?<startWeek>\\d+)-(?<endWeek>\\d+)\\(周\\)"); private static final Pattern WEEK_RANGE_REGEX = Pattern.compile("(?<startWeek>\\d+)-(?<endWeek>\\d+)");
private static final Pattern SINGLE_WEEK_REGEX = Pattern.compile("(?<week>\\d+)\\(周\\)"); private static final Pattern SINGLE_WEEK_REGEX = Pattern.compile("(?<week>\\d+)");
@Override @Override
public List<Course> parse(String html) throws ParseException { public List<Course> parse(String html) throws ParseException {
@ -40,6 +40,7 @@ public class CourseTableParser implements Parser<List<Course>> {
List<Course> courses = new ArrayList<>(girds.size()); List<Course> courses = new ArrayList<>(girds.size());
// 遍历每个格子,使用girdCount计数格子来计算节次信息
int girdCount = 0; int girdCount = 0;
for (Element gird : girds) { for (Element gird : girds) {
girdCount++; girdCount++;
@ -48,61 +49,74 @@ public class CourseTableParser implements Parser<List<Course>> {
String girdHtml = gird.outerHtml().replace(COURSE_SPLIT_STR, COURSE_SPLIT_TAG_STR); String girdHtml = gird.outerHtml().replace(COURSE_SPLIT_STR, COURSE_SPLIT_TAG_STR);
Elements courseElements = Jsoup.parse(girdHtml).getElementsByTag("div"); Elements courseElements = Jsoup.parse(girdHtml).getElementsByTag("div");
for (Element courseElement : courseElements) { for (Element courseElement : courseElements) {
String courseName = courseElement.ownText(); Course.CourseBuilder courseBuilder = Course.builder();
// 格子文本为空,说明这个格子没课,直接跳过这个格子就行了 // 格子文本为空,说明这个格子没课,直接跳过这个格子就行了
// 注意,使用这个条件判断时对jsoup版本有要求,在比较旧的版本下gird.ownText()空格子其实并不空,而是有一个空格的
// 在某个版本之后(至少是1.10到1.15之间的某个版本)会自动剔除多余空格(trim()),所以直接这样判断就行了
// 只不过需要注意一下jsoup的版本,太旧的话可能不会起作用,如确需在旧版本上使用请手动trim或加条件
String courseName = courseElement.ownText();
if ("".equals(courseName)) { if ("".equals(courseName)) {
continue; continue;
} }
courseBuilder.name(courseName);
// 直接获取格子里所有课程的关键字段,每个下标对应格子里相应的课程 // 直接获取格子里所有课程的关键字段,每个下标对应格子里相应的课程
Elements classElements = courseElement.getElementsByAttributeValue("title", "课堂名称"); Elements classElements = courseElement.getElementsByAttributeValue("title", "课堂名称");
Elements teacherElements = courseElement.getElementsByAttributeValue("title", "老师"); Elements teacherElements = courseElement.getElementsByAttributeValue("title", "老师");
Elements timeElements = courseElement.getElementsByAttributeValue("title", "周次(节次)"); Elements timeElements = courseElement.getElementsByAttributeValue("title", "周次(节次)");
Elements classroomElements = courseElement.getElementsByAttributeValue("title", "教室"); Elements classroomElements = courseElement.getElementsByAttributeValue("title", "教室");
Course course = new Course(); courseBuilder.teachClass(classElements.isEmpty() ? "" : classElements.get(0).text());
courseBuilder.teacher(teacherElements.isEmpty() ? "" : teacherElements.get(0).text());
course.setName(courseName);
course.setTeachClass(classElements.isEmpty() ? "" : classElements.get(0).text());
course.setTeacher(teacherElements.isEmpty() ? "" : teacherElements.get(0).text());
ClassRoom classRoom = new ClassRoom(); ClassRoom classRoom = new ClassRoom();
classRoom.setRoom(classroomElements.isEmpty() ? "" : classroomElements.get(0).text()); classRoom.setRoom(classroomElements.isEmpty() ? "" : classroomElements.get(0).text());
course.setClassroom(classRoom); courseBuilder.classroom(classRoom);
// 提取周次信息
String time = timeElements.isEmpty() ? "" : timeElements.get(0).text();
Matcher matcher = WEEK_RANGE_REGEX.matcher(time);
if (matcher.find()) {
course.setStartWeek(Integer.parseInt(matcher.group("startWeek")));
course.setEndWeek(Integer.parseInt(matcher.group("endWeek")));
} else {
// 普通匹配不到的话多半就是只有一周的课程
matcher = SINGLE_WEEK_REGEX.matcher(time);
if (matcher.find()) {
course.setStartWeek(Integer.parseInt(matcher.group("week")));
course.setEndWeek(Integer.parseInt(matcher.group("week")));
}
}
// 靠行位置来确定节次,而不是靠time字段的节次数据确定(因为太不好处理了) int weekDay = girdCount % 7;
courseBuilder.weekDay(weekDay == 0 ? 7 : weekDay);
// 靠行位置来确定节次和星期,而不是靠time字段的数据确定(因为太不好处理了)
// 对于只有一个小节的课程,这类课程多数是在线课程,这里一律按照两小节大课处理
// 具体算法就是行索引x2 + 1就是开始的节次(索引从0开始) // 具体算法就是行索引x2 + 1就是开始的节次(索引从0开始)
int lineIndex = (int) (girdCount * 0.142); int lineIndex = (int) (girdCount * 0.142);
course.setStartSection(lineIndex * 2 + 1); courseBuilder.startSection(lineIndex * 2 + 1);
course.setEndSection(lineIndex * 2 + 2); courseBuilder.endSection(lineIndex * 2 + 2);
int weekDay = girdCount % 7; // 提取周次信息,可能会有用","分成两段的周次信息文本
course.setWeekDay(weekDay == 0 ? 7 : weekDay); // 去除后面不需要的节次信息,以免对正则提取产生影响
// 这样做理论上有点浪费性能了,但还行
String timeText = timeElements.isEmpty() ? "" : timeElements.get(0).text().split("\\[")[0];
String[] times = timeText.split(",");
for (String time : times) {
int startWeek = 0;
int endWeek = 0;
Matcher matcher = WEEK_RANGE_REGEX.matcher(time);
if (matcher.find()) {
startWeek = Integer.parseInt(matcher.group("startWeek"));
endWeek = Integer.parseInt(matcher.group("endWeek"));
} else {
// 普通匹配不到的话多半就是只有一周的课程
matcher = SINGLE_WEEK_REGEX.matcher(time);
if (matcher.find()) {
startWeek = Integer.parseInt(matcher.group("week"));
endWeek = Integer.parseInt(matcher.group("week"));
}
}
courses.add(course); courseBuilder.startWeek(startWeek).endWeek(endWeek);
courses.add(courseBuilder.build());
}
} }
} }
return courses; return courses;
} catch (Exception e) { } catch (Exception e) {
log.warn("解析课表时出现问题:{}", e.getMessage(), e); log.warn("解析课表时出现问题:{}", e.getMessage(), e);
throw new ParseException(); throw new ParseException(html);
} }
} }

@ -20,34 +20,37 @@ public class ExamInfoParser implements Parser<List<ExamInfo>> {
public List<ExamInfo> parse(String html) throws ParseException { public List<ExamInfo> parse(String html) throws ParseException {
Elements rows = Jsoup.parse(html).selectXpath(ExamInfoXpath.EXAM_INFO_ROWS_XPATH); Elements rows = Jsoup.parse(html).selectXpath(ExamInfoXpath.EXAM_INFO_ROWS_XPATH);
if (rows.isEmpty()) { if (rows.isEmpty()) {
throw new ParseException(); throw new ParseException(html);
} }
List<ExamInfo> examInfos = new ArrayList<>(rows.size()); List<ExamInfo> examInfos = new ArrayList<>(rows.size());
try { try {
for (Element row : rows) { for (Element row : rows) {
Elements columns = row.getElementsByTag("td"); // 提取出当前行的所有格子
if (columns.size() < 14) { Elements girds = row.getElementsByTag("td");
// 如果这行格子数少于6个,即到了“成绩”的那个格子就没了,那就没啥意义了,直接跳过,不理了
if (girds.size() < 6) {
continue; continue;
} }
ExamInfo examInfo = new ExamInfo(); ExamInfo examInfo = new ExamInfo();
// 这段看着震撼,但其实很丑 // 这段看着震撼,但其实很丑
examInfo.setId(columns.get(0).text()); examInfo.setId(girds.get(0).text());
examInfo.setTerm(columns.get(1).text()); examInfo.setTerm(girds.get(1).text());
examInfo.setCourseNumber(columns.get(2).text()); examInfo.setCourseNumber(girds.get(2).text());
examInfo.setCourseName(columns.get(3).text()); examInfo.setCourseName(girds.get(3).text());
examInfo.setGroupName(columns.get(4).text()); examInfo.setGroupName(girds.get(4).text());
examInfo.setScore(columns.get(5).text()); examInfo.setScore(girds.get(5).text());
examInfo.setFlag(columns.get(6).text()); examInfo.setFlag(girds.get(6).text());
examInfo.setCredit(columns.get(7).text()); examInfo.setCredit(girds.get(7).text());
examInfo.setCourseHours(columns.get(8).text()); examInfo.setCourseHours(girds.get(8).text());
examInfo.setGradePoint(columns.get(9).text()); examInfo.setGradePoint(girds.get(9).text());
examInfo.setEvaluateMethod(columns.get(11).text()); examInfo.setEvaluateMethod(girds.get(11).text());
examInfo.setKind(columns.get(12).text()); examInfo.setKind(girds.get(12).text());
examInfo.setCourseKind(columns.get(13).text()); examInfo.setCourseKind(girds.get(13).text());
examInfos.add(examInfo); examInfos.add(examInfo);
} }

@ -15,7 +15,7 @@ public class StudentInfoPageParser implements Parser<StudentInfo> {
Document page = Jsoup.parse(html); Document page = Jsoup.parse(html);
Element table = page.getElementById("xjkpTable"); Element table = page.getElementById("xjkpTable");
if (table == null) { if (table == null) {
throw new ParseException(); throw new ParseException(html);
} }
Elements studentElements = table.selectXpath(StudentInfoXpath.STUDENT_NUMBER); Elements studentElements = table.selectXpath(StudentInfoXpath.STUDENT_NUMBER);

@ -4,16 +4,19 @@ import cn.linghang.mywust.core.exception.ParseException;
import cn.linghang.mywust.core.parser.Parser; import cn.linghang.mywust.core.parser.Parser;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class TrainingPlanPageParser implements Parser<String> { public class TrainingPlanPageParser implements Parser<String> {
@Override @Override
public String parse(String html) throws ParseException { public String parse(String html) throws ParseException {
Element trainingPlanElement = Jsoup.parse(html).getElementById("dataList"); Elements trainingPlanElement = Jsoup.parse(html).selectXpath("/html/body/div/div/form[1]");
if (trainingPlanElement == null) { if (trainingPlanElement.isEmpty()) {
throw new ParseException("教学方案html解析提取失败,id为dataList的元素不存在"); throw new ParseException("教学方案html解析提取失败,id为dataList的元素不存在", html);
} }
return trainingPlanElement.outerHtml(); // 有极少部分19级的学生培养方案页面错乱,中间某部分会被挪到最上边,直接使用id为dataList的表格提取会导致缺失部分信息
// 在找到更好的解析处理方式之前,此处不对顺序进行处理,直接原样返回
return trainingPlanElement.get(0).outerHtml();
} }
} }

@ -27,7 +27,7 @@ public class SchemeTest {
RequestClientOption.Proxy proxy = new RequestClientOption.Proxy(); RequestClientOption.Proxy proxy = new RequestClientOption.Proxy();
proxy.setPort(6060); proxy.setPort(6060);
proxy.setAddress("127.0.0.1"); proxy.setAddress("127.0.0.1");
option.setProxy(proxy); option.setProxy(null);
option.setFallowUrlRedirect(false); option.setFallowUrlRedirect(false);
Requester requester = new SimpleOkhttpRequester(); Requester requester = new SimpleOkhttpRequester();

Loading…
Cancel
Save