新增本科生全校课表和教师课表解析;优化课表时间字段解析

old-package
lensfrex 2 years ago
parent 29e482943b
commit 0822796201
Signed by: lensfrex
GPG Key ID: 947ADABD8533C476
  1. 8
      mywust-core/src/main/java/cn/linghang/mywust/core/parser/HuangjiahuClassroomNameParser.java
  2. 4
      mywust-core/src/main/java/cn/linghang/mywust/core/parser/graduate/GraduateCourseTableParser.java
  3. 4
      mywust-core/src/main/java/cn/linghang/mywust/core/parser/physics/PhysicsCoursePageParser.java
  4. 43
      mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/UndergradCourseTableParser.java
  5. 146
      mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/global/GlobalCourseTableParser.java
  6. 22
      mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/global/UndergradAllCourseScheduleParser.java
  7. 22
      mywust-core/src/main/java/cn/linghang/mywust/core/parser/undergraduate/global/UndergradTeacherCourseParser.java
  8. 2
      mywust-core/src/main/java/cn/linghang/mywust/core/request/undergrade/global/BkjxAllCourseRequestFactory.java
  9. 2
      mywust-model/src/main/java/cn/linghang/mywust/model/global/Classroom.java
  10. 16
      mywust-model/src/main/java/cn/linghang/mywust/model/global/Course.java
  11. 19
      mywust-util/src/main/java/cn/linghang/mywust/util/StringUtil.java

@ -1,7 +1,7 @@
package cn.linghang.mywust.core.parser; package cn.linghang.mywust.core.parser;
import cn.linghang.mywust.core.exception.ParseException; import cn.linghang.mywust.core.exception.ParseException;
import cn.linghang.mywust.model.global.ClassRoom; import cn.linghang.mywust.model.global.Classroom;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -15,7 +15,7 @@ import java.util.regex.Pattern;
* @author lensfrex * @author lensfrex
* @create 2022-10-26 08:56 * @create 2022-10-26 08:56
*/ */
public class HuangjiahuClassroomNameParser implements Parser<ClassRoom> { public class HuangjiahuClassroomNameParser implements Parser<Classroom> {
private static final Logger log = LoggerFactory.getLogger(HuangjiahuClassroomNameParser.class); private static final Logger log = LoggerFactory.getLogger(HuangjiahuClassroomNameParser.class);
private static final Pattern CLASSROOM_PATTERN = Pattern.compile("(?<buildingId>\\d)(?<areaId>\\d)(?<room>\\d{3})"); private static final Pattern CLASSROOM_PATTERN = Pattern.compile("(?<buildingId>\\d)(?<areaId>\\d)(?<room>\\d{3})");
@ -23,8 +23,8 @@ public class HuangjiahuClassroomNameParser implements Parser<ClassRoom> {
private static final Pattern BUILDING_11_CLASSROOM_PATTERN = Pattern.compile("11(?<areaId>[A-C])(?<room>\\d{3})"); private static final Pattern BUILDING_11_CLASSROOM_PATTERN = Pattern.compile("11(?<areaId>[A-C])(?<room>\\d{3})");
@Override @Override
public ClassRoom parse(String classroomName) throws ParseException { public Classroom parse(String classroomName) throws ParseException {
ClassRoom classRoom = ClassRoom.builder().campus("黄家湖").build(); Classroom classRoom = Classroom.builder().campus("黄家湖").build();
try { try {
Matcher matcher = CLASSROOM_PATTERN.matcher(classroomName); Matcher matcher = CLASSROOM_PATTERN.matcher(classroomName);
// 不匹配普通教学楼正则的多半就是教11的教室 // 不匹配普通教学楼正则的多半就是教11的教室

@ -2,7 +2,7 @@ package cn.linghang.mywust.core.parser.graduate;
import cn.linghang.mywust.core.exception.ParseException; import cn.linghang.mywust.core.exception.ParseException;
import cn.linghang.mywust.core.parser.Parser; import cn.linghang.mywust.core.parser.Parser;
import cn.linghang.mywust.model.global.ClassRoom; import cn.linghang.mywust.model.global.Classroom;
import cn.linghang.mywust.model.global.Course; import cn.linghang.mywust.model.global.Course;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
@ -45,7 +45,7 @@ public class GraduateCourseTableParser implements Parser<List<Course>> {
courseBuilder.teachClass(teachClass); courseBuilder.teachClass(teachClass);
String classroom = matcher.group("classRoom"); String classroom = matcher.group("classRoom");
courseBuilder.classroom(new ClassRoom("", "", "", classroom)); courseBuilder.classroom(new Classroom("", "", "", classroom));
String teacher = matcher.group("teacher"); String teacher = matcher.group("teacher");
courseBuilder.teacher(teacher); courseBuilder.teacher(teacher);

@ -3,7 +3,7 @@ package cn.linghang.mywust.core.parser.physics;
import cn.linghang.mywust.core.exception.ParseException; import cn.linghang.mywust.core.exception.ParseException;
import cn.linghang.mywust.core.parser.HuangjiahuClassroomNameParser; import cn.linghang.mywust.core.parser.HuangjiahuClassroomNameParser;
import cn.linghang.mywust.core.parser.Parser; import cn.linghang.mywust.core.parser.Parser;
import cn.linghang.mywust.model.global.ClassRoom; import cn.linghang.mywust.model.global.Classroom;
import cn.linghang.mywust.model.physics.PhysicsCourse; import cn.linghang.mywust.model.physics.PhysicsCourse;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
@ -47,7 +47,7 @@ public class PhysicsCoursePageParser implements Parser<List<PhysicsCourse>> {
course.setTeacher(columnContextElements.get(3).text().replace('\uE863', '䶮')); course.setTeacher(columnContextElements.get(3).text().replace('\uE863', '䶮'));
String classroomNumber = columnContextElements.get(5).text(); String classroomNumber = columnContextElements.get(5).text();
ClassRoom classRoom = HUANGJIAHU_CLASSROOM_NAME_PARSER.parse(classroomNumber); Classroom classRoom = HUANGJIAHU_CLASSROOM_NAME_PARSER.parse(classroomNumber);
course.setClassroom(classRoom); course.setClassroom(classRoom);
String time = columnContextElements.get(4).text(); String time = columnContextElements.get(4).text();

@ -3,8 +3,9 @@ package cn.linghang.mywust.core.parser.undergraduate;
import cn.linghang.mywust.core.exception.ParseException; import cn.linghang.mywust.core.exception.ParseException;
import cn.linghang.mywust.core.parser.Parser; import cn.linghang.mywust.core.parser.Parser;
import cn.linghang.mywust.core.util.JsoupUtil; import cn.linghang.mywust.core.util.JsoupUtil;
import cn.linghang.mywust.model.global.ClassRoom; import cn.linghang.mywust.model.global.Classroom;
import cn.linghang.mywust.model.global.Course; import cn.linghang.mywust.model.global.Course;
import cn.linghang.mywust.util.StringUtil;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
@ -23,9 +24,7 @@ public class UndergradCourseTableParser implements Parser<List<Course>> {
private static final String COURSE_SPLIT_TAG_STR = "</div><div>"; private static final String COURSE_SPLIT_TAG_STR = "</div><div>";
private static final Pattern WEEK_RANGE_REGEX = Pattern.compile("(?<startWeek>\\d+)-(?<endWeek>\\d+)"); private static final Pattern WEEK_REGEX = Pattern.compile("\\d+");
private static final Pattern SINGLE_WEEK_REGEX = Pattern.compile("(?<week>\\d+)");
@Override @Override
public List<Course> parse(String html) throws ParseException { public List<Course> parse(String html) throws ParseException {
@ -50,8 +49,6 @@ public class UndergradCourseTableParser implements Parser<List<Course>> {
String girdHtml = gird.outerHtml().replace(COURSE_SPLIT_STR, COURSE_SPLIT_TAG_STR); String girdHtml = gird.outerHtml().replace(COURSE_SPLIT_STR, COURSE_SPLIT_TAG_STR);
Elements courseElements = Jsoup.parse(girdHtml).getElementsByTag("div"); Elements courseElements = Jsoup.parse(girdHtml).getElementsByTag("div");
for (Element courseElement : courseElements) { for (Element courseElement : courseElements) {
Course.CourseBuilder courseBuilder = Course.builder();
// 格子文本为空,说明这个格子没课,直接跳过这个格子就行了 // 格子文本为空,说明这个格子没课,直接跳过这个格子就行了
// 注意,使用这个条件判断时对jsoup版本有要求,在比较旧的版本下gird.ownText()空格子其实并不空,而是有一个空格的 // 注意,使用这个条件判断时对jsoup版本有要求,在比较旧的版本下gird.ownText()空格子其实并不空,而是有一个空格的
// 在某个版本之后(至少是1.10到1.15之间的某个版本)会自动剔除多余空格(trim()),所以直接这样判断就行了 // 在某个版本之后(至少是1.10到1.15之间的某个版本)会自动剔除多余空格(trim()),所以直接这样判断就行了
@ -61,6 +58,8 @@ public class UndergradCourseTableParser implements Parser<List<Course>> {
continue; continue;
} }
Course.CourseBuilder courseBuilder = Course.builder();
courseBuilder.name(courseName); courseBuilder.name(courseName);
// 直接获取格子里所有课程的关键字段,每个下表对应格子里相应的课程 // 直接获取格子里所有课程的关键字段,每个下表对应格子里相应的课程
@ -72,7 +71,7 @@ public class UndergradCourseTableParser implements Parser<List<Course>> {
courseBuilder.teachClass(JsoupUtil.getElementText(classElements)); courseBuilder.teachClass(JsoupUtil.getElementText(classElements));
courseBuilder.teacher(JsoupUtil.getElementText(teacherElements)); courseBuilder.teacher(JsoupUtil.getElementText(teacherElements));
ClassRoom classRoom = new ClassRoom(); Classroom classRoom = new Classroom();
classRoom.setRoom(JsoupUtil.getElementText(classroomElements)); classRoom.setRoom(JsoupUtil.getElementText(classroomElements));
courseBuilder.classroom(classRoom); courseBuilder.classroom(classRoom);
@ -86,29 +85,21 @@ public class UndergradCourseTableParser implements Parser<List<Course>> {
courseBuilder.startSection(lineIndex * 2 + 1); courseBuilder.startSection(lineIndex * 2 + 1);
courseBuilder.endSection(lineIndex * 2 + 2); courseBuilder.endSection(lineIndex * 2 + 2);
// 提取周次信息,可能会有用","分成两段的周次信息文本 // 不直接使用String.split而是手动分割,是因为系统自带split方法每次调用都需要编译一次切割正则,效率不太行
// 去除后面不需要的节次信息,以免对正则提取产生影响 String timeText = timeElements.isEmpty() ? "" : StringUtil.split(timeElements.get(0).text(), ',').get(0);
// 这样做理论上有点浪费性能了,但还行 List<String> times = StringUtil.split(timeText, ',');
String timeText = timeElements.isEmpty() ? "" : timeElements.get(0).text().split("\\[")[0];
String[] times = timeText.split(",");
for (String time : times) { for (String time : times) {
int startWeek = 0; Matcher weekMatcher = WEEK_REGEX.matcher(time);
int endWeek = 0; // 周次信息不是数字,这种情况尚未出现过,这里的if判断只是用于消除warming
if (!weekMatcher.find()) {
Matcher matcher = WEEK_RANGE_REGEX.matcher(time); continue;
if (matcher.find()) {
startWeek = Integer.parseInt(matcher.group("startWeek"));
endWeek = Integer.parseInt(matcher.group("endWeek"));
} else {
// 普通匹配不到的话多半就是只有一周的课程
matcher = SINGLE_WEEK_REGEX.matcher(time);
if (matcher.find()) {
startWeek = Integer.parseInt(matcher.group("week"));
endWeek = Integer.parseInt(matcher.group("week"));
}
} }
// 第二次matcher.find()匹配结束周,如果没有数字匹配说明是单周课程
int startWeek = Integer.parseInt(weekMatcher.group());
int endWeek = weekMatcher.find() ? Integer.parseInt(weekMatcher.group()) : startWeek;
courseBuilder.startWeek(startWeek).endWeek(endWeek); courseBuilder.startWeek(startWeek).endWeek(endWeek);
courses.add(courseBuilder.build()); courses.add(courseBuilder.build());
} }
} }

@ -0,0 +1,146 @@
package cn.linghang.mywust.core.parser.undergraduate.global;
import cn.linghang.mywust.core.exception.ParseException;
import cn.linghang.mywust.model.global.Classroom;
import cn.linghang.mywust.model.global.Course;
import cn.linghang.mywust.util.StringUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Shared parsing logic for timetable pages whose course cells are laid out as a grid:
 * every table row holds one leading header cell followed by the course cells of a week.
 * <p>
 * Subclasses supply the XPath that selects the table cells and the regular expression
 * that splits the text of one course entry into its fields.
 */
abstract class GlobalCourseTableParser {
    /** Number of two-section time slots per day; used to derive weekday and section from a cell index. */
    private static final int SLOTS_PER_DAY = 6;

    /** Number of course cells in one table row (7 days x 6 slots); the cell counter restarts after this many. */
    private static final int CELLS_PER_ROW = 7 * SLOTS_PER_DAY;

    /** Matches one run of digits, i.e. a single week number inside a week-range text. */
    private static final Pattern WEEK_PATTERN = Pattern.compile("\\d+");

    /**
     * Parses every course found in the cells selected by {@code xpath}.
     * <p>
     * One {@link Course} is produced per week range, so a course entry listing
     * "1-8,10-16" yields two courses that differ only in start/end week.
     *
     * @param html        the timetable page
     * @param xpath       XPath selecting all table cells (header column included)
     * @param infoPattern pattern for one course entry; must define the named groups
     *                    {@code name}, {@code teacher}, {@code teachClass},
     *                    {@code weekString} and {@code building}
     * @return the parsed courses; empty when the XPath selects nothing
     * @throws ParseException declared for subclasses and callers; not thrown by this implementation
     */
    protected List<Course> parse(String html, String xpath, Pattern infoPattern) throws ParseException {
        // Grab all the table cells in one go.
        Elements bigGirds = Jsoup.parse(html).selectXpath(xpath);
        if (bigGirds.isEmpty()) {
            return new ArrayList<>();
        }

        List<Course> courses = new ArrayList<>(bigGirds.size());

        int girdCount = 0;
        for (Element bigGird : bigGirds) {
            // A whole row has been processed: reset the counter so the weekday/section math starts over.
            if (girdCount == CELLS_PER_ROW) {
                girdCount = 0;
            }

            // Empty cell: no course in this slot, but the slot itself still counts.
            if (bigGird.text().isEmpty()) {
                girdCount++;
                continue;
            }

            // All course entries inside this cell.
            Elements courseGirds = bigGird.getElementsByTag("div");

            // The leftmost cell of a row is the row header, not a course slot; it contains no <div>.
            // Skip it WITHOUT counting, otherwise weekday and section would be shifted.
            if (courseGirds.isEmpty()) {
                continue;
            }

            // Derive weekday and section from the position of the cell within its row.
            int weekDay = girdCount / SLOTS_PER_DAY + 1;
            int startSection = (girdCount % SLOTS_PER_DAY) * 2 + 1;

            Course.CourseBuilder courseBuilder = Course.builder();
            courseBuilder.weekDay(weekDay);
            courseBuilder.startSection(startSection);
            courseBuilder.endSection(startSection + 1);

            for (Element courseGird : courseGirds) {
                Matcher courseInfoMatcher = infoPattern.matcher(courseGird.ownText());
                if (!courseInfoMatcher.find()) {
                    continue;
                }

                courseBuilder.name(courseInfoMatcher.group("name"));
                courseBuilder.teacher(courseInfoMatcher.group("teacher"));
                courseBuilder.teachClass(courseInfoMatcher.group("teachClass"));
                courseBuilder.classroom(this.parseClassroom(courseInfoMatcher.group("building")));

                addCoursePerWeekRange(courseBuilder, courseInfoMatcher.group("weekString"), courses);
            }

            // Do not forget: this cell occupied a slot.
            girdCount++;
        }

        return courses;
    }

    /**
     * Builds one course per comma-separated week range in {@code weekString} and appends it to {@code courses}.
     * <p>
     * {@code StringUtil.split} is used instead of {@code String.split} because this runs many times
     * and the original author wanted to avoid the regex-based splitting overhead.
     */
    private static void addCoursePerWeekRange(Course.CourseBuilder courseBuilder, String weekString, List<Course> courses) {
        for (String week : StringUtil.split(weekString, ',')) {
            Matcher weekMatcher = WEEK_PATTERN.matcher(week);
            // A week text without any digit has not been observed so far; skip it defensively.
            if (!weekMatcher.find()) {
                continue;
            }

            int startWeek = Integer.parseInt(weekMatcher.group());
            // A second number is the end week; without one the course lasts a single week.
            int endWeek = weekMatcher.find() ? Integer.parseInt(weekMatcher.group()) : startWeek;

            courseBuilder.startWeek(startWeek);
            courseBuilder.endWeek(endWeek);

            courses.add(courseBuilder.build());
        }
    }

    // Place-name formats, adapted from the old project.
    // A place like "xx楼x区xxx(国际专用)" needs no pattern of its own: BUILDING_AREA_ROOM_PATTERN
    // already matches its prefix and yields the same building/area/room.
    private static final Pattern BUILDING_AREA_ROOM_PATTERN = Pattern.compile("(?<building>.*楼)(?<area>.*区)(?<room>\\d+)"); // xx楼x区xxx
    private static final Pattern BUILDING_ROOM_CAMPUS_PATTERN = Pattern.compile("(?<building>.*楼)(?<room>\\d+)\\((?<campus>.*)\\)"); // xx楼xxx(黄家湖)
    private static final Pattern BUILDING_ROOM = Pattern.compile("(?<building>.*楼)(?<room>\\d+)"); // xx楼xxx
    private static final Pattern BUILDING_COLLEGE_ROOM_PATTERN = Pattern.compile("(?<building>.*楼)\\((.*)\\)(?<room>\\d+)"); // xx楼(xxx)xxx

    /**
     * Parses a place name into its campus / building / area / room parts.
     * <p>
     * The formats are tried one by one with {@link Matcher#find()}. Because {@code find()} also
     * accepts a match on a prefix of the text, a more specific format must be tried before the
     * more general format it extends: the "building + room + (campus)" format is therefore checked
     * before the plain "building + room" format, which would otherwise swallow it and lose the campus.
     *
     * @param placeName the place name as shown in the timetable
     * @return the parsed classroom; when no format matches, a classroom whose room is the
     *         unmodified {@code placeName} and whose other parts are empty
     */
    protected Classroom parseClassroom(String placeName) {
        Matcher matcher = BUILDING_AREA_ROOM_PATTERN.matcher(placeName);
        if (matcher.find()) {
            return new Classroom("", matcher.group("building"), matcher.group("area"), matcher.group("room"));
        }

        // Must come before BUILDING_ROOM, which matches a prefix of every text this pattern matches.
        matcher = BUILDING_ROOM_CAMPUS_PATTERN.matcher(placeName);
        if (matcher.find()) {
            return new Classroom(matcher.group("campus"), matcher.group("building"), "", matcher.group("room"));
        }

        matcher = BUILDING_ROOM.matcher(placeName);
        if (matcher.find()) {
            return new Classroom("", matcher.group("building"), "", matcher.group("room"));
        }

        matcher = BUILDING_COLLEGE_ROOM_PATTERN.matcher(placeName);
        if (matcher.find()) {
            return new Classroom("", matcher.group("building"), "", matcher.group("room"));
        }

        return new Classroom("", "", "", placeName);
    }
}

@ -0,0 +1,22 @@
package cn.linghang.mywust.core.parser.undergraduate.global;
import cn.linghang.mywust.core.exception.ParseException;
import cn.linghang.mywust.core.parser.Parser;
import cn.linghang.mywust.model.global.Course;
import java.util.List;
import java.util.regex.Pattern;
/**
 * Parser that feeds the cells of the {@code kbtable} table to the shared grid parsing
 * logic of {@link GlobalCourseTableParser}, using a space-separated course-entry format.
 */
public class UndergradAllCourseScheduleParser extends GlobalCourseTableParser implements Parser<List<Course>> {
    /** Selects every cell of the timetable table. */
    private static final String COURSE_GIRDS_XPATH = "//*[@id=\"kbtable\"]/tbody/tr/td";

    // Using a reluctant ".*?" for the name group instead of the greedy ".*" would cut the amount of
    // matching work considerably, but could mis-parse courses whose names contain spaces.
    // Courses that have no teacher AND a space in their name (all of them online courses) still
    // match, but their fields end up shifted; the impact is small, so it is not worth fussing over.
    private static final Pattern COURSE_INFO_PATTERN = Pattern.compile("(?<name>.*) (?<teachClass>.*?) (?<teacher>.*?) \\((?<weekString>.*?)周\\) ?(?<building>.*)");

    /**
     * Parses the timetable page into courses.
     *
     * @param html the timetable page
     * @return the parsed courses
     * @throws ParseException as declared by the shared parsing routine
     */
    @Override
    public List<Course> parse(String html) throws ParseException {
        List<Course> parsedCourses = super.parse(html, COURSE_GIRDS_XPATH, COURSE_INFO_PATTERN);
        return parsedCourses;
    }
}

@ -0,0 +1,22 @@
package cn.linghang.mywust.core.parser.undergraduate.global;
import cn.linghang.mywust.core.exception.ParseException;
import cn.linghang.mywust.core.parser.Parser;
import cn.linghang.mywust.model.global.Course;
import java.util.List;
import java.util.regex.Pattern;
/**
 * Parser that feeds the cells of the {@code kbtable} table to the shared grid parsing
 * logic of {@link GlobalCourseTableParser}, using a course-entry format in which the
 * teaching class follows the course name directly.
 */
public class UndergradTeacherCourseParser extends GlobalCourseTableParser implements Parser<List<Course>> {
    /** Selects every cell of the timetable table. */
    private static final String COURSE_GIRDS_XPATH = "//*[@id=\"kbtable\"]/tbody/tr/td";

    // Not matched: online courses whose teaching class is named like "20xx寒假课堂xx", and more
    // complex teaching-class fields such as "国贸[1901-1903]班,国贸1901(香涛)班". There are only a
    // handful of them, so they are ignored.
    // The root problem is that teaching class and course name are fused together and hard to
    // separate, so some parsed fields are not very accurate. Querying by course is recommended
    // instead (although it seems to return fewer courses than this page, possibly the online ones).
    private static final Pattern COURSE_INFO_PATTERN = Pattern.compile("(?<name>.*?[^ ]) ?(?<teachClass>教学班\\d+|\\d+班|临班\\d+|\\[\\d+-\\d+]班) (?<teacher>.*?) \\((?<weekString>.*?)周\\) ?(?<building>.*)");

    /**
     * Parses the timetable page into courses.
     *
     * @param html the timetable page
     * @return the parsed courses
     * @throws ParseException as declared by the shared parsing routine
     */
    @Override
    public List<Course> parse(String html) throws ParseException {
        List<Course> parsedCourses = super.parse(html, COURSE_GIRDS_XPATH, COURSE_INFO_PATTERN);
        return parsedCourses;
    }
}

@ -43,7 +43,7 @@ public class BkjxAllCourseRequestFactory extends BkjxRequestFactory {
.add("jc1", "") .add("jc1", "")
.add("jc2", ""); .add("jc2", "");
return makeStringDataHttpRequest(UndergradUrls.BKJX_CLASSROOM_COURSE_API, formBodyBuilder.buildAndToString(), cookies); return makeStringDataHttpRequest(UndergradUrls.BKJX_TEACHER_COURSE_API, formBodyBuilder.buildAndToString(), cookies);
} }
public static HttpRequest allCourseSchedulePageRequest(String cookies, String term, String timeMode, String subCollegeId, String courseName) { public static HttpRequest allCourseSchedulePageRequest(String cookies, String term, String timeMode, String subCollegeId, String courseName) {

@ -9,7 +9,7 @@ import lombok.NoArgsConstructor;
@Builder @Builder
@AllArgsConstructor @AllArgsConstructor
@NoArgsConstructor @NoArgsConstructor
public class ClassRoom { public class Classroom {
/** /**
* 校区黄家湖或青山 * 校区黄家湖或青山
*/ */

@ -7,6 +7,7 @@ import lombok.NoArgsConstructor;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.Objects;
@Data @Data
@Builder @Builder
@ -53,7 +54,7 @@ public class Course {
*/ */
private int endSection; private int endSection;
private ClassRoom classroom; private Classroom classroom;
private static final Map<String, Integer> WEEKDAY_MAP = makeWeekdayMap(); private static final Map<String, Integer> WEEKDAY_MAP = makeWeekdayMap();
@ -69,6 +70,19 @@ public class Course {
this.weekDay = weekDay; this.weekDay = weekDay;
} }
/**
 * Two courses are equal when they share the same teaching class and occupy the same
 * time slot (start/end week, weekday, start/end section). No other field is compared.
 *
 * @param o the object to compare with
 * @return {@code true} if {@code o} is a {@code Course} of the same class with equal
 *         teaching class and time slot
 */
@Override
public boolean equals(Object o) {
    if (o == this) {
        return true;
    }
    if (o == null || o.getClass() != getClass()) {
        return false;
    }

    Course that = (Course) o;
    if (startWeek != that.startWeek || endWeek != that.endWeek) {
        return false;
    }
    if (weekDay != that.weekDay) {
        return false;
    }
    if (startSection != that.startSection || endSection != that.endSection) {
        return false;
    }
    return Objects.equals(teachClass, that.teachClass);
}
/**
 * Hash over the teaching class and the time-slot fields, folded with the usual
 * 31-multiplier scheme starting from 1 (the same result {@code Objects.hash} gives
 * for these values in this order).
 *
 * @return the hash code
 */
@Override
public int hashCode() {
    int result = 1;
    result = 31 * result + Objects.hashCode(teachClass);
    result = 31 * result + Objects.hashCode(startWeek);
    result = 31 * result + Objects.hashCode(endWeek);
    result = 31 * result + Objects.hashCode(weekDay);
    result = 31 * result + Objects.hashCode(startSection);
    result = 31 * result + Objects.hashCode(endSection);
    return result;
}
private static Map<String, Integer> makeWeekdayMap() { private static Map<String, Integer> makeWeekdayMap() {
HashMap<String, Integer> map = new HashMap<>(7 + 2); HashMap<String, Integer> map = new HashMap<>(7 + 2);

@ -54,6 +54,7 @@ public class StringUtil {
/** /**
* 生成参数签名 * 生成参数签名
*
* @param appId appId * @param appId appId
* @param secretKey secretKey * @param secretKey secretKey
* @return 生成得到的签名sign字段 * @return 生成得到的签名sign字段
@ -97,4 +98,22 @@ public class StringUtil {
// 一般八月到第二年二月算是是秋季期 // 一般八月到第二年二月算是是秋季期
return getTermString(now, month >= 8 || month < 2); return getTermString(now, month >= 8 || month < 2);
} }
/**
 * Splits {@code source} around every occurrence of the single character {@code gap},
 * without involving regular expressions.
 * <p>
 * Empty pieces are kept, including leading and trailing ones, so the result always
 * contains one more element than there are separators: {@code ""} yields {@code [""]}
 * and {@code ",a,"} yields {@code ["", "a", ""]}.
 *
 * @param source the text to split; must not be {@code null}
 * @param gap    the separator character
 * @return the pieces in order of appearance, never empty
 */
public static List<String> split(String source, char gap) {
    List<String> pieces = new ArrayList<>(4);

    int pieceStart = 0;
    int gapAt = source.indexOf(gap, pieceStart);
    while (gapAt >= 0) {
        pieces.add(source.substring(pieceStart, gapAt));
        pieceStart = gapAt + 1;
        gapAt = source.indexOf(gap, pieceStart);
    }

    // Whatever follows the last separator (possibly nothing) is the final piece.
    pieces.add(source.substring(pieceStart));

    return pieces;
}
} }

Loading…
Cancel
Save