2024-09-19 18:00:24 +08:00
|
|
|
import re
|
2024-09-13 16:05:16 +08:00
|
|
|
|
2024-09-19 18:00:24 +08:00
|
|
|
def get_patterns_for_qualification():
    """Return the compiled regexes that delimit the qualification-review section.

    Returns:
        A ``(begin_pattern, end_pattern)`` tuple of compiled patterns, both
        using ``re.MULTILINE`` so ``^`` matches at the start of every line.

        * ``begin_pattern`` matches a chapter/part heading line
          ("第<numeral>章" or "第<numeral>部分") that contains '资格审查' or
          '资格性检查'. Its single capture group holds whichever of the two
          keywords was found.
        * ``end_pattern`` matches a line that starts with a chapter/part
          heading followed by CJK text, or a line that starts with '附件'
          followed by a number. It has no capture groups.
    """
    # Heading such as "第一章 资格审查": chapter numbering first, then any text,
    # then one of the two qualification keywords.
    begin_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查|资格性检查).*',
        re.MULTILINE,
    )

    # Start of the next chapter, or a line beginning with "附件 <number>".
    # Both alternatives sit inside one non-capturing group so the leading '^'
    # applies to each of them; without the group, '|' binds looser than '^'
    # and "附件 3" would match in the middle of a sentence.
    end_pattern = re.compile(
        r'^(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|附件\s*\d+)',
        re.MULTILINE,
    )

    return begin_pattern, end_pattern
|
2024-09-19 11:33:17 +08:00
|
|
|
|
2024-09-19 18:00:24 +08:00
|
|
|
# Fetch the compiled regular expressions.
begin_pattern, end_pattern = get_patterns_for_qualification()

# Sample text used to check that the regular expressions work.
test_string = """
第一章 资格审查
资格性检查
第二章 实施规范
附件 4 使用说明
"""

# Try begin_pattern against the sample text.
begin_matches = begin_pattern.findall(test_string)
print("Begin Matches:", begin_matches)

# Try end_pattern against the sample text.
end_matches = end_pattern.findall(test_string)
print("End Matches:", end_matches)
|