Files
NJUPT-Suan-API/njupt_api/zhengfang/createcourse.py
MangoFanFanw b284c3c260 Python 后端提交
Python 后端(FastAPI + FastMCP + ...)的初始版本号设定为 0.1.0,这是 uv 在 pyproject.toml
里给我自动设置的,我觉得有道理。
2026-04-21 13:38:46 +08:00

316 lines
9.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import re
from bs4 import BeautifulSoup
from .types import Course
def normalize_course_str(course_str: str) -> str:
    """Normalize a raw course string so that ``create_course`` can parse it.

    The string is treated as ``<br>``-separated fields. Leading empty fields
    are dropped, the field list is padded to at least four entries with a
    single-space placeholder, and empty values in the third and fourth
    fields are replaced by that same placeholder.

    Args:
        course_str: Raw course text using ``<br>`` as the field separator.

    Returns:
        The normalized string, re-joined with ``<br>``.
    """
    fields = course_str.split("<br>")

    # Find the first non-empty field and discard everything before it.
    first_filled = 0
    while first_filled < len(fields) and fields[first_filled] == "":
        first_filled += 1
    fields = fields[first_filled:]

    # Pad to four fields; a non-positive multiplier yields no padding.
    fields += [" "] * (4 - len(fields))

    # The third and fourth fields must never be empty strings.
    for slot in (2, 3):
        if fields[slot] == "":
            fields[slot] = " "

    return "<br>".join(fields)
# Header text -> day of week (1 = Monday ... 7 = Sunday).
_DAY_NAMES: dict[str, int] = {
    "星期一": 1,
    "星期二": 2,
    "星期三": 3,
    "星期四": 4,
    "星期五": 5,
    "星期六": 6,
    "星期日": 7,
    "星期天": 7,
}
# Row labels that name a part of the day and never contain course data.
_TIME_OF_DAY_LABELS = frozenset({"早晨", "上午", "下午", "晚上"})
# Row header naming a period, e.g. "第1节".
_PERIOD_HEADER_RE = re.compile(r"第(\d+)节")
_BR_TAG_RE = re.compile(r"<br\s*/?>")
# Two or more consecutive <br> separate distinct courses inside one cell.
_COURSE_SEPARATOR_RE = re.compile(r"(?:<br>){2,}")


def create_course_schedule(html: str) -> list[Course]:
    """Parse a timetable HTML string into a list of ``Course`` objects.

    The first table row is read as the header and maps column indexes to
    days of the week. Every following row is walked cell by cell while
    tracking columns that are still occupied by a ``rowspan`` cell from an
    earlier row, so that each course cell is attributed to the right day.

    Args:
        html: HTML text. It should contain exactly one ``<table>`` element
            holding the timetable; the first ``<table>`` found is used.

    Returns:
        All courses found in the table, in document order.

    Raises:
        ValueError: If ``html`` contains no ``<table>`` element.
    """
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table")
    if table is None:
        raise ValueError("no <table> element found in timetable HTML")
    rows = table.find_all("tr")

    courses: list[Course] = []
    # col_idx -> number of upcoming rows in which that column is still
    # covered by a rowspan cell from an earlier row.
    rowspan_map: dict[int, int] = {}

    # Build the column -> day mapping from the header row. Header cells that
    # are not a weekday name (e.g. the "时间" cell) only advance the column.
    day_map: dict[int, int] = {}
    if rows:
        col_idx = 0
        for cell in rows[0].find_all(["td", "th"]):
            colspan = int(cell.get("colspan", 1))
            day = _DAY_NAMES.get(cell.get_text(strip=True))
            if day is not None:
                for c in range(col_idx, col_idx + colspan):
                    day_map[c] = day
            col_idx += colspan

    for row in rows[1:]:
        col_idx = 0
        class_start: int | None = None
        for cell in row.find_all(["td", "th"]):
            # Skip columns covered by a rowspan cell from a previous row.
            while rowspan_map.get(col_idx, 0) > 0:
                rowspan_map[col_idx] -= 1
                if rowspan_map[col_idx] == 0:
                    del rowspan_map[col_idx]
                col_idx += 1

            text = cell.get_text(strip=True)
            colspan = int(cell.get("colspan", 1))
            rowspan = int(cell.get("rowspan", 1))

            period_header = _PERIOD_HEADER_RE.fullmatch(text)
            if period_header:
                # Remember the row's period number; it is the fallback start
                # period for courses that only state a period count.
                class_start = int(period_header.group(1))
            elif text not in _TIME_OF_DAY_LABELS:
                # decode_contents() gives the cell's inner markup for both
                # <td> and <th>, without slicing on a hard-coded "</td>".
                inner_html = cell.decode_contents()
                if "&nbsp;" not in inner_html and inner_html.strip():
                    inner_html = _BR_TAG_RE.sub("<br>", inner_html)
                    # Falls back to Monday when the column has no header.
                    day = day_map.get(col_idx, 1)
                    for chunk in _COURSE_SEPARATOR_RE.split(inner_html):
                        course_str = chunk.strip()
                        if not course_str:
                            continue
                        courses.append(
                            create_course(
                                normalize_course_str(course_str),
                                day,
                                default_classes_start=class_start,
                            ),
                        )

            if rowspan > 1:
                for c in range(col_idx, col_idx + colspan):
                    rowspan_map[c] = rowspan - 1
            col_idx += colspan

        # Columns to the right of the row's last cell may still be covered by
        # an earlier rowspan. They were not visited above, so count this row
        # against them here; otherwise the stale entry would shift cells of a
        # later row one column to the right.
        for c in [c for c in rowspan_map if c >= col_idx]:
            rowspan_map[c] -= 1
            if rowspan_map[c] == 0:
                del rowspan_map[c]

    return courses
def _expand_weeks(
    first: int,
    last: int,
    *,
    odd_only: bool = False,
    even_only: bool = False,
) -> list[int]:
    """Return the weeks ``first..last`` inclusive, optionally odd or even only."""
    return [
        week
        for week in range(first, last + 1)
        if not (odd_only and week % 2 == 0)
        and not (even_only and week % 2 == 1)
    ]


def create_course(
    raw: str,
    day: int,
    default_classes_start: int | None = None,
) -> Course:
    """Build a ``Course`` from the raw string extracted from the HTML.

    ``raw`` is split on ``<br>`` into (after dropping leading empty fields)
    name, time description, teacher and classroom, for example
    ``['概率论与数理统计', '1-17单(1,2)', '王雪红', '教3-520', '']``.

    The time description is expected in one of these shapes:

    * ``1-17(3,4)`` / ``1-17单(1,2)`` (``单`` = odd weeks, ``双`` = even weeks)
    * ``2节/周`` / ``2节/单周`` / ``2节/双周`` (weeks default to 1-16)
    * ``周三第3,4节{第1-17周}`` / ``周五第3,4节{第2-16周|双周}``

    If none of the shapes match, ``weeks`` and ``classes`` stay empty.

    Args:
        raw: Raw course text using ``<br>`` as the line separator.
        day: Day of the week the course takes place on.
        default_classes_start: First period to use when the time description
            only states a period count (the ``N节/周`` shapes). It should be
            parsed from the row header of the table.

    Returns:
        The parsed ``Course``. A teacher or classroom equal to the
        single-space placeholder is stored as ``None``.

    Raises:
        IndexError: If ``raw`` has fewer than four fields after the leading
            empty ones are removed.
        TypeError: If a period count has to be expanded but
            ``default_classes_start`` is ``None``.
        ValueError: If a numeric part of the time description is malformed.
    """
    #    0                   1              2        3          4
    # ['概率论与数理统计', '1-17单(1,2)', '王雪红', '教3-520', '']
    raw_list = raw.split("<br>")
    # Drop leading empty fields without indexing into an empty list.
    while raw_list and raw_list[0] == "":
        raw_list.pop(0)

    raw_time = raw_list[1]
    weeks: list[int] = []
    classes: list[int] = []

    if "-" in raw_time and "第" not in raw_time:
        # '1-17(3,4)' or '1-17单(1,2)'
        t = raw_time.split("(")  # ['1-17', '3,4)'] or ['1-17单', '1,2)']
        start, end = t[0].split("-")  # ['1', '17'] or ['1', '17单']
        odd_only = end.endswith("单")
        even_only = end.endswith("双")
        if odd_only or even_only:
            end = end[:-1]
        weeks = _expand_weeks(
            int(start), int(end), odd_only=odd_only, even_only=even_only
        )
        classes = [int(i) for i in t[1].removesuffix(")").split(",")]
    elif "/" in raw_time:
        # '2节/周', '2节/单周' or '2节/双周'; the term defaults to weeks 1-16.
        odd_only = "/单周" in raw_time
        even_only = not odd_only and "/双周" in raw_time
        weeks = _expand_weeks(1, 16, odd_only=odd_only, even_only=even_only)
        # Only a period count is given; expand it from the row's start period.
        period_count = int(raw_time.split("节")[0])
        if period_count > 0 and default_classes_start is None:
            raise TypeError(
                "default_classes_start is required to expand a period count "
                f"in time description {raw_time!r}"
            )
        classes = [default_classes_start + i for i in range(period_count)]
    elif "第" in raw_time:
        # '周三第3,4节{第1-17周}'      -> ['周三', '3,4节{', '1-17周}']
        # '周五第3,4节{第2-16周|双周}' -> ['周五', '3,4节{', '2-16周|双周}']
        u = raw_time.split("第")
        classes = [int(u_c) for u_c in u[1].split("节")[0].split(",")]
        # '1-17周}'      -> ['1-17', '}']
        # '2-16周|双周}' -> ['2-16', '|双', '}']
        u_w = u[2].split("周")
        odd_only = "单" in u_w[1]
        even_only = not odd_only and "双" in u_w[1]
        u_start, u_end = u_w[0].split("-")
        weeks = _expand_weeks(
            int(u_start), int(u_end), odd_only=odd_only, even_only=even_only
        )

    # A single space is the placeholder for a missing teacher/classroom.
    teacher = raw_list[2] if raw_list[2] != " " else None
    classroom = raw_list[3] if raw_list[3] != " " else None
    return Course(raw_list[0], weeks, day, classes, teacher, classroom)
def convert_dict_schedule_to_tuple(schedule: list[dict]) -> list[tuple]:
    """Convert a dict-based timetable into the compact tuple form.

    Args:
        schedule: Course dicts. The keys read are ``name``, ``teacher``,
            ``classroom``, ``weeks``, ``day`` and ``classes``; missing keys
            fall back to ``""``, ``None``, ``None``, ``[]``, ``1`` and ``[]``
            respectively.

    Returns:
        One ``(name, teacher, classroom, weeks_str, day, classes)`` tuple per
        input dict, where ``weeks_str`` is the week list compressed into a
        string such as ``"1-17"`` (empty string when there are no weeks).
    """

    def to_row(entry: dict) -> tuple:
        week_list = entry.get("weeks", [])
        return (
            entry.get("name", ""),
            entry.get("teacher"),
            entry.get("classroom"),
            # Only compress when there is something to compress.
            compress_weeks_to_string(week_list) if week_list else "",
            entry.get("day", 1),
            entry.get("classes", []),
        )

    return [to_row(entry) for entry in schedule]
def compress_weeks_to_string(weeks: list[int]) -> str:
    """Compress a list of week numbers into its shortest string form.

    Examples:
        [1,2,3,4,5]     -> "1-5"
        [1,3,5,7]       -> "1,3,5,7"
        [1,2,3,5,6,7,8] -> "1-3,5-8"
        [1]             -> "1"

    Args:
        weeks: Week numbers; duplicates and ordering do not matter.

    Returns:
        Comma-separated single weeks and ``start-end`` ranges, or an empty
        string when ``weeks`` is empty.
    """
    if not weeks:
        return ""

    # Deduplicate and sort so consecutive weeks end up adjacent.
    ordered = sorted({int(week) for week in weeks})

    # Each run is a mutable [first, last] pair; extend the latest run while
    # the next week is contiguous, otherwise open a new one.
    runs: list[list[int]] = [[ordered[0], ordered[0]]]
    for week in ordered[1:]:
        latest = runs[-1]
        if week - latest[1] == 1:
            latest[1] = week
        else:
            runs.append([week, week])

    return ",".join(
        str(first) if first == last else f"{first}-{last}"
        for first, last in runs
    )