11.8 截取资格文件bug处理

This commit is contained in:
zy123 2024-11-08 17:09:44 +08:00
parent 1e4949e83b
commit 5c1cc1dd4b

View File

@@ -475,26 +475,37 @@ import re
def extract_pages_qualification(pdf_document, begin_page, common_header):
# 开始匹配模式,仅匹配“附录”、“附件”或“附表”
begin_pattern = re.compile(
r'^(?:附录(?:一|1)?[:]?|附件(?:一|1)?[:]?|附表(?:一|1)?[:]?)',
re.MULTILINE
)
# 优先匹配模式,匹配以“资格性检查”、“资格审查”或“符合性审查”开头
priority_pattern = re.compile(
r'^(资格性检查|资格审查|符合性审查)',
re.MULTILINE
)
end_pattern = re.compile(
r'^(?!.*(?:资格|符合))(?:附录.*?[:]|附件.*?[:]|附表.*?[:]|附件\s*\d+).*$|' #可能出现附件与符合性审查不在同一行的情况,误判结束位置。
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
# 结束匹配模式 - 附录、附件、附表等
end_pattern_attachment = re.compile(
r'^(?:附录.*?[:]|附件.*?[:]|附表.*?[:]|附件\s*\d+).*$',
re.MULTILINE
)
# 结束匹配模式 - 章节标题
end_pattern_chapter = re.compile(
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
re.MULTILINE
)
print("第二次尝试:匹配附件")
start_page = None
end_page = None
include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查"]
include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查","资格检查"]
exclude_keywords = ["声明函", "承诺函"]
# 从章节开始后的位置进行检查
# 从指定的开始页开始遍历
for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
text = page.extract_text()
if text:
@@ -505,6 +516,7 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
if priority_match and start_page is None:
start_page = i
# print(f"匹配到优先模式,设置起始页为: {start_page}")
continue
else:
# 如果未匹配优先模式,则按照一般模式进行判断
if (
@@ -514,15 +526,26 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
):
if begin_pattern.search(cleaned_text):
start_page = i
print(f"匹配到附录等模式,设置起始页为: {start_page}")
continue
# print(f"匹配到附录等模式,设置起始页为: {start_page}")
# 确定结束页
if start_page is not None and end_pattern.search(cleaned_text):
# 确定结束页 - 附录、附件、附表等
if start_page is not None and end_pattern_attachment.search(cleaned_text):
# 额外检查当前页面是否不包含任何 include_keywords
if not any(keyword in cleaned_text for keyword in include_keywords):
if i > start_page:
end_page = i
# print(f"找到结束页: {end_page}")
# print(f"找到结束页 (附件类): {end_page}")
break # 找到结束页后退出循环
else:
print(f"当前页面匹配附件结束模式但包含 include_keywords继续查找。页面: {i}")
# 确定结束页 - 章节标题
elif start_page is not None and end_pattern_chapter.search(cleaned_text):
if i > start_page:
end_page = i
print(f"找到结束页 (章节标题): {end_page}")
break # 找到结束页后退出循环
return start_page, end_page