|
|
|
@ -24,8 +24,21 @@ public class UndergradCourseTableParser implements Parser<List<Course>> { |
|
|
|
|
|
|
|
|
|
private static final String COURSE_SPLIT_TAG_STR = "</div><div>"; |
|
|
|
|
|
|
|
|
|
private static final Pattern WEEK_REGEX = Pattern.compile("\\d+"); |
|
|
|
|
|
|
|
|
|
// 用来匹配数字的,位数不限
|
|
|
|
|
private static final Pattern DIGITAL_PATTERN = Pattern.compile("\\d+"); |
|
|
|
|
|
|
|
|
|
// 例:1-17(周)[03-04节]; 1-2,4-7(周)[01-02节]
|
|
|
|
|
// 容易看一点的:(?<week>.*?)(周)[(?<section>.*?)节]
|
|
|
|
|
// 提取出来后:week: 1-17, section: 03-04; week: 1-2,4-7, section: 01-02
|
|
|
|
|
private static final Pattern WEEK_SECTION_REGEX = Pattern.compile("(?<week>.*?)\\(周\\)\\[(?<section>.*?)节]"); |
|
|
|
|
|
|
|
|
|
/** |
|
|
|
|
* 解析课程,可能会有重复的课程,调用者需要手动去重 |
|
|
|
|
* |
|
|
|
|
* @param html 原页面html |
|
|
|
|
* @return 解析好的课程List |
|
|
|
|
* @throws ParseException 解析课表时出现任何问题 |
|
|
|
|
*/ |
|
|
|
|
@Override |
|
|
|
|
public List<Course> parse(String html) throws ParseException { |
|
|
|
|
try { |
|
|
|
@ -40,12 +53,12 @@ public class UndergradCourseTableParser implements Parser<List<Course>> { |
|
|
|
|
|
|
|
|
|
List<Course> courses = new ArrayList<>(girds.size()); |
|
|
|
|
|
|
|
|
|
// 遍历每个格子,使用girdCount计数格子来计算节次信息
|
|
|
|
|
// 遍历每个格子,使用girdCount计数格子来计算星期
|
|
|
|
|
int girdCount = 0; |
|
|
|
|
for (Element gird : girds) { |
|
|
|
|
girdCount++; |
|
|
|
|
|
|
|
|
|
// 将分隔符替换成标签,方便解析
|
|
|
|
|
// 将分隔符替换成标签,方便重新解析格子
|
|
|
|
|
String girdHtml = gird.outerHtml().replace(COURSE_SPLIT_STR, COURSE_SPLIT_TAG_STR); |
|
|
|
|
Elements courseElements = Jsoup.parse(girdHtml).getElementsByTag("div"); |
|
|
|
|
for (Element courseElement : courseElements) { |
|
|
|
@ -62,7 +75,7 @@ public class UndergradCourseTableParser implements Parser<List<Course>> { |
|
|
|
|
|
|
|
|
|
courseBuilder.name(courseName); |
|
|
|
|
|
|
|
|
|
// 直接获取格子里所有课程的关键字段,每个下表对应格子里相应的课程
|
|
|
|
|
// 直接获取格子里的关键信息
|
|
|
|
|
Elements classElements = courseElement.getElementsByAttributeValue("title", "课堂名称"); |
|
|
|
|
Elements teacherElements = courseElement.getElementsByAttributeValue("title", "老师"); |
|
|
|
|
Elements timeElements = courseElement.getElementsByAttributeValue("title", "周次(节次)"); |
|
|
|
@ -78,31 +91,8 @@ public class UndergradCourseTableParser implements Parser<List<Course>> { |
|
|
|
|
int weekDay = girdCount % 7; |
|
|
|
|
courseBuilder.weekDay(weekDay == 0 ? 7 : weekDay); |
|
|
|
|
|
|
|
|
|
// 靠行位置来确定节次和星期,而不是靠time字段的数据确定(因为太不好处理了)
|
|
|
|
|
// 对于只有一个小节的课程,这类课程多数是在线课程,这里一律按照两小节大课处理
|
|
|
|
|
// 具体算法就是行索引x2 + 1就是开始的节次(索引从0开始)
|
|
|
|
|
int lineIndex = (girdCount / 7); |
|
|
|
|
courseBuilder.startSection(lineIndex * 2 + 1); |
|
|
|
|
courseBuilder.endSection(lineIndex * 2 + 2); |
|
|
|
|
|
|
|
|
|
// 切割连续周信息,如"1-2,4-6(周)"这样两段的连续周(1-3周和5-10周)
|
|
|
|
|
// 不直接使用String.split而是手动分割,是因为系统自带split方法每次调用都需要编译一次切割正则,效率不太行
|
|
|
|
|
String timeText = StringUtil.split(JsoupUtil.getElementText(timeElements, 0), ',').get(0); |
|
|
|
|
List<String> times = StringUtil.split(timeText, ','); |
|
|
|
|
for (String time : times) { |
|
|
|
|
Matcher weekMatcher = WEEK_REGEX.matcher(time); |
|
|
|
|
// 周次信息不是数字,这种情况尚未出现过,这里的if判断只是用于消除warming
|
|
|
|
|
if (!weekMatcher.find()) { |
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// 第二次matcher.find()匹配结束周,如果没有数字匹配说明是单周课程
|
|
|
|
|
int startWeek = Integer.parseInt(weekMatcher.group()); |
|
|
|
|
int endWeek = weekMatcher.find() ? Integer.parseInt(weekMatcher.group()) : startWeek; |
|
|
|
|
courseBuilder.startWeek(startWeek).endWeek(endWeek); |
|
|
|
|
|
|
|
|
|
courses.add(courseBuilder.build()); |
|
|
|
|
} |
|
|
|
|
String timeText = JsoupUtil.getElementText(timeElements, 0); |
|
|
|
|
this.parseTime(timeText, courseBuilder, courses); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -113,4 +103,52 @@ public class UndergradCourseTableParser implements Parser<List<Course>> { |
|
|
|
|
throw new ParseException("解析课表时出现问题:" + e, html); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
/** |
|
|
|
|
* 解析周次和节次时间,主要使用正则解析 |
|
|
|
|
* |
|
|
|
|
* @param timeText 周次节次文本,如:1-17(周)[03-04节]; 1-2,4-7(周)[01-02节] |
|
|
|
|
* @param courseBuilder courseBuilder |
|
|
|
|
* @param courses 课程解析结果列表 |
|
|
|
|
*/ |
|
|
|
|
private void parseTime(String timeText, Course.CourseBuilder courseBuilder, List<Course> courses) { |
|
|
|
|
Matcher timeMatcher = WEEK_SECTION_REGEX.matcher(timeText); |
|
|
|
|
if (timeMatcher.find()) { |
|
|
|
|
// 解析节次,这种方法相比于通过定位格子来确定节次更准确,但是可能会出现重复的课程
|
|
|
|
|
String sectionString = timeMatcher.group("section"); |
|
|
|
|
Matcher sectionMatcher = DIGITAL_PATTERN.matcher(sectionString); |
|
|
|
|
if (sectionMatcher.find()) { |
|
|
|
|
int startSection = Integer.parseInt(sectionMatcher.group()); |
|
|
|
|
|
|
|
|
|
// 不断匹配下一个数字,直到最后一个,即为结束节次数字,如果第一次就不匹配,则为单节课
|
|
|
|
|
int endSection = startSection; |
|
|
|
|
while (sectionMatcher.find()) { |
|
|
|
|
endSection = Integer.parseInt(sectionMatcher.group()); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
courseBuilder.startSection(startSection); |
|
|
|
|
courseBuilder.endSection(endSection); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
String weekString = timeMatcher.group("week"); |
|
|
|
|
|
|
|
|
|
// 切割连续周信息,如"1-2,4-6(周)"这样两段的连续周(1-3周和5-10周)
|
|
|
|
|
// 不直接使用String.split而是手动分割,是因为系统自带split方法每次调用都需要编译一次切割正则,效率不太行,但是有一说一,其实可以忽略
|
|
|
|
|
List<String> weekTexts = StringUtil.split(weekString, ','); |
|
|
|
|
for (String weekText : weekTexts) { |
|
|
|
|
Matcher weekMatcher = DIGITAL_PATTERN.matcher(weekText); |
|
|
|
|
// 周次信息不是数字,这种情况尚未出现过,这里的if判断只是用于消除warming
|
|
|
|
|
if (!weekMatcher.find()) { |
|
|
|
|
continue; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// 第二次matcher.find()匹配结束周,如果没有数字匹配说明是单周课程
|
|
|
|
|
int startWeek = Integer.parseInt(weekMatcher.group()); |
|
|
|
|
int endWeek = weekMatcher.find() ? Integer.parseInt(weekMatcher.group()) : startWeek; |
|
|
|
|
courseBuilder.startWeek(startWeek).endWeek(endWeek); |
|
|
|
|
|
|
|
|
|
courses.add(courseBuilder.build()); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|