parent
29e482943b
commit
0822796201
@ -0,0 +1,146 @@ |
|||||||
|
package cn.linghang.mywust.core.parser.undergraduate.global; |
||||||
|
|
||||||
|
import cn.linghang.mywust.core.exception.ParseException; |
||||||
|
import cn.linghang.mywust.model.global.Classroom; |
||||||
|
import cn.linghang.mywust.model.global.Course; |
||||||
|
import cn.linghang.mywust.util.StringUtil; |
||||||
|
import org.jsoup.Jsoup; |
||||||
|
import org.jsoup.nodes.Element; |
||||||
|
import org.jsoup.select.Elements; |
||||||
|
|
||||||
|
import java.util.ArrayList; |
||||||
|
import java.util.List; |
||||||
|
import java.util.regex.Matcher; |
||||||
|
import java.util.regex.Pattern; |
||||||
|
|
||||||
|
abstract class GlobalCourseTableParser { |
||||||
|
|
||||||
|
private static final Pattern weekPattern = Pattern.compile("\\d+"); |
||||||
|
|
||||||
|
protected List<Course> parse(String html, String xpath, Pattern infoPattern) throws ParseException { |
||||||
|
// 直接一步到位,拿到所有的大格子
|
||||||
|
Elements bigGirds = Jsoup.parse(html).selectXpath(xpath); |
||||||
|
System.out.println("Girds: " + bigGirds.size()); |
||||||
|
if (bigGirds.isEmpty()) { |
||||||
|
return new ArrayList<>(); |
||||||
|
} |
||||||
|
|
||||||
|
List<Course> courses = new ArrayList<>(bigGirds.size()); |
||||||
|
|
||||||
|
int girdCount = 0; |
||||||
|
for (Element bigGird : bigGirds) { |
||||||
|
// 处理完一行,计数器置零,方便后续的计算
|
||||||
|
if (girdCount == 42) { |
||||||
|
girdCount = 0; |
||||||
|
} |
||||||
|
|
||||||
|
// 大格子是空的,没课,跳过就好了
|
||||||
|
if (bigGird.text().length() == 0) { |
||||||
|
girdCount++; |
||||||
|
continue; |
||||||
|
} |
||||||
|
|
||||||
|
// 拿到格子里面所有的课
|
||||||
|
Elements courseGirds = bigGird.getElementsByTag("div"); |
||||||
|
|
||||||
|
// 最左边的格子是表的首列,不是课程,拿到的集合是空的,直接跳过,不要计数,以免影响计算节次和星期信息
|
||||||
|
if (courseGirds.isEmpty()) { |
||||||
|
continue; |
||||||
|
} |
||||||
|
|
||||||
|
// 通过计数器计算上课星期和节次信息
|
||||||
|
int weekDay = girdCount / 6 + 1; |
||||||
|
int startSection = (girdCount % 6) * 2 + 1; |
||||||
|
|
||||||
|
Course.CourseBuilder courseBuilder = Course.builder(); |
||||||
|
courseBuilder.weekDay(weekDay); |
||||||
|
courseBuilder.startSection(startSection); |
||||||
|
courseBuilder.endSection(startSection + 1); |
||||||
|
|
||||||
|
// 解析格子里的课
|
||||||
|
for (Element courseGird : courseGirds) { |
||||||
|
Matcher courseInfoMatcher = infoPattern.matcher(courseGird.ownText()); |
||||||
|
if (!courseInfoMatcher.find()) { |
||||||
|
continue; |
||||||
|
} |
||||||
|
|
||||||
|
courseBuilder.name(courseInfoMatcher.group("name")); |
||||||
|
courseBuilder.teacher(courseInfoMatcher.group("teacher")); |
||||||
|
courseBuilder.teachClass(courseInfoMatcher.group("teachClass")); |
||||||
|
courseBuilder.classroom(this.parseClassroom(courseInfoMatcher.group("building"))); |
||||||
|
|
||||||
|
// 解析周次,不使用String.split而是手动分割,是因为系统自带split方法每次调用都需要编译一次切割正则,这里需要执行次数较多,效率不太行
|
||||||
|
List<String> weeks = StringUtil.split(courseInfoMatcher.group("weekString"), ','); |
||||||
|
for (String week : weeks) { |
||||||
|
Matcher weekMatcher = weekPattern.matcher(week); |
||||||
|
// 周次信息不是数字,这种情况尚未出现过,这里的if判断只是用于消除warming
|
||||||
|
if (!weekMatcher.find()) { |
||||||
|
continue; |
||||||
|
} |
||||||
|
|
||||||
|
int startWeek = Integer.parseInt(weekMatcher.group()); |
||||||
|
// 再执行一次matcher.find(),如果没有数字匹配说明是单周课程
|
||||||
|
int endWeek = weekMatcher.find() ? Integer.parseInt(weekMatcher.group()) : startWeek; |
||||||
|
|
||||||
|
courseBuilder.startWeek(startWeek); |
||||||
|
courseBuilder.endWeek(endWeek); |
||||||
|
|
||||||
|
courses.add(courseBuilder.build()); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// 别忘了这里还有个计数器
|
||||||
|
girdCount++; |
||||||
|
} |
||||||
|
|
||||||
|
return courses; |
||||||
|
} |
||||||
|
|
||||||
|
// 搬运修改自老项目
|
||||||
|
private static final Pattern BUILDING_ROOM_INTERNATION_PATTERN = Pattern.compile("(?<building>.*楼)(?<area>.*区)(?<room>\\d+)\\((国际专用)\\)"); // xx楼x区xxx(国际专用)
|
||||||
|
|
||||||
|
private static final Pattern BUILDING_ROOM_CAMPUS_PATTERN = Pattern.compile("(?<building>.*楼)(?<room>\\d+)\\((?<campus>.*)\\)"); // xx楼xxx(黄家湖)
|
||||||
|
|
||||||
|
private static final Pattern BUILDING_AREA_ROOM_PATTERN = Pattern.compile("(?<building>.*楼)(?<area>.*区)(?<room>\\d+)"); // xx楼x区xxx
|
||||||
|
|
||||||
|
private static final Pattern BUILDING_COLLEGE_ROOM_PATTERN = Pattern.compile("(?<building>.*楼)\\((.*)\\)(?<room>\\d+)"); //xx楼(xxx)xxx
|
||||||
|
|
||||||
|
private static final Pattern BUILDING_ROOM = Pattern.compile("(?<building>.*楼)(?<room>\\d+)"); // xx楼xxx
|
||||||
|
|
||||||
|
/** |
||||||
|
* 解析上课地点 |
||||||
|
* |
||||||
|
* @param placeName 上课地点名称 |
||||||
|
* @return 解析后的上课地点 |
||||||
|
*/ |
||||||
|
protected Classroom parseClassroom(String placeName) { |
||||||
|
// 用正则一个一个格式匹配,看起来有点不太聪明的样子,但是目前并未想到更好的办法...
|
||||||
|
// 下面的顺序是按照频率排好的,概率越高,就放在更前面,减少正则匹配的次数
|
||||||
|
Matcher matcher = BUILDING_AREA_ROOM_PATTERN.matcher(placeName); |
||||||
|
if (matcher.find()) { |
||||||
|
return new Classroom("", matcher.group("building"), matcher.group("area"), matcher.group("room")); |
||||||
|
} |
||||||
|
|
||||||
|
matcher = BUILDING_ROOM.matcher(placeName); |
||||||
|
if (matcher.find()) { |
||||||
|
return new Classroom("", matcher.group("building"), "", matcher.group("room")); |
||||||
|
} |
||||||
|
|
||||||
|
matcher = BUILDING_ROOM_CAMPUS_PATTERN.matcher(placeName); |
||||||
|
if (matcher.find()) { |
||||||
|
return new Classroom(matcher.group("campus"), matcher.group("building"), "", matcher.group("room")); |
||||||
|
} |
||||||
|
|
||||||
|
matcher = BUILDING_COLLEGE_ROOM_PATTERN.matcher(placeName); |
||||||
|
if (matcher.find()) { |
||||||
|
return new Classroom("", matcher.group("building"), "", matcher.group("room")); |
||||||
|
} |
||||||
|
|
||||||
|
matcher = BUILDING_ROOM_INTERNATION_PATTERN.matcher(placeName); |
||||||
|
if (matcher.find()) { |
||||||
|
return new Classroom("", matcher.group("building"), matcher.group("area"), matcher.group("room")); |
||||||
|
} |
||||||
|
|
||||||
|
return new Classroom("", "", "", placeName); |
||||||
|
} |
||||||
|
} |
@ -0,0 +1,22 @@ |
|||||||
|
package cn.linghang.mywust.core.parser.undergraduate.global; |
||||||
|
|
||||||
|
import cn.linghang.mywust.core.exception.ParseException; |
||||||
|
import cn.linghang.mywust.core.parser.Parser; |
||||||
|
import cn.linghang.mywust.model.global.Course; |
||||||
|
|
||||||
|
import java.util.List; |
||||||
|
import java.util.regex.Pattern; |
||||||
|
|
||||||
|
|
||||||
|
public class UndergradAllCourseScheduleParser extends GlobalCourseTableParser implements Parser<List<Course>> { |
||||||
|
private static final String courseGirdsXPath = "//*[@id=\"kbtable\"]/tbody/tr/td"; |
||||||
|
|
||||||
|
// name组的如果使用.*?匹配而不是.*贪婪匹配的话虽然可以大大减少匹配次数,但是对于课程名中有空格的课程,可能会导致解析错误,
|
||||||
|
// 不过对于没有老师且课程名中含有空格的课程(都是一些在线课程),虽然能匹配,但是字段会有偏差错误,不过影响不大就是了,在线课程不用太过于纠结
|
||||||
|
private static final Pattern pattern = Pattern.compile("(?<name>.*) (?<teachClass>.*?) (?<teacher>.*?) \\((?<weekString>.*?)周\\) ?(?<building>.*)"); |
||||||
|
|
||||||
|
@Override |
||||||
|
public List<Course> parse(String html) throws ParseException { |
||||||
|
return super.parse(html, courseGirdsXPath, pattern); |
||||||
|
} |
||||||
|
} |
@ -0,0 +1,22 @@ |
|||||||
|
package cn.linghang.mywust.core.parser.undergraduate.global; |
||||||
|
|
||||||
|
import cn.linghang.mywust.core.exception.ParseException; |
||||||
|
import cn.linghang.mywust.core.parser.Parser; |
||||||
|
import cn.linghang.mywust.model.global.Course; |
||||||
|
|
||||||
|
import java.util.List; |
||||||
|
import java.util.regex.Pattern; |
||||||
|
|
||||||
|
|
||||||
|
public class UndergradTeacherCourseParser extends GlobalCourseTableParser implements Parser<List<Course>> { |
||||||
|
private static final String courseGirdsXPath = "//*[@id=\"kbtable\"]/tbody/tr/td"; |
||||||
|
|
||||||
|
// 还有教学班名称格式为20xx寒假课堂xx的线上课和教学班类似于“国贸[1901-1903]班,国贸1901(香涛)班”的这种较为复杂的教学班字段是匹配不到的,但是数量极少,大约只有几个,可以忽略不计
|
||||||
|
// 其实主要是教学班和课程名称和在一起了,不好分离,这里解析出来的字段有些是不怎么准确的,推荐使用课程课表来查询(但是得到的课貌似比这里的少(可能是线上课?))
|
||||||
|
private static final Pattern pattern = Pattern.compile("(?<name>.*?[^ ]) ?(?<teachClass>教学班\\d+|\\d+班|临班\\d+|\\[\\d+-\\d+]班) (?<teacher>.*?) \\((?<weekString>.*?)周\\) ?(?<building>.*)"); |
||||||
|
|
||||||
|
@Override |
||||||
|
public List<Course> parse(String html) throws ParseException { |
||||||
|
return super.parse(html, courseGirdsXPath, pattern); |
||||||
|
} |
||||||
|
} |
Loading…
Reference in new issue