11.8 截取资格文件bug处理
This commit is contained in:
parent
1e4949e83b
commit
5c1cc1dd4b
@ -475,26 +475,37 @@ import re
|
||||
|
||||
|
||||
def extract_pages_qualification(pdf_document, begin_page, common_header):
|
||||
# 开始匹配模式,仅匹配“附录”、“附件”或“附表”
|
||||
begin_pattern = re.compile(
|
||||
r'^(?:附录(?:一|1)?[::]?|附件(?:一|1)?[::]?|附表(?:一|1)?[::]?)',
|
||||
re.MULTILINE
|
||||
)
|
||||
|
||||
# 优先匹配模式,匹配以“资格性检查”、“资格审查”或“符合性审查”开头
|
||||
priority_pattern = re.compile(
|
||||
r'^(资格性检查|资格审查|符合性审查)',
|
||||
re.MULTILINE
|
||||
)
|
||||
end_pattern = re.compile(
|
||||
r'^(?!.*(?:资格|符合))(?:附录.*?[::]|附件.*?[::]|附表.*?[::]|附件\s*\d+).*$|' #可能出现附件与符合性审查不在同一行的情况,误判结束位置。
|
||||
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',
|
||||
|
||||
# 结束匹配模式 - 附录、附件、附表等
|
||||
end_pattern_attachment = re.compile(
|
||||
r'^(?:附录.*?[::]|附件.*?[::]|附表.*?[::]|附件\s*\d+).*$',
|
||||
re.MULTILINE
|
||||
)
|
||||
|
||||
# 结束匹配模式 - 章节标题
|
||||
end_pattern_chapter = re.compile(
|
||||
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',
|
||||
re.MULTILINE
|
||||
)
|
||||
|
||||
print("第二次尝试:匹配附件")
|
||||
start_page = None
|
||||
end_page = None
|
||||
include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查"]
|
||||
include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查","资格检查"]
|
||||
exclude_keywords = ["声明函", "承诺函"]
|
||||
|
||||
# 从章节开始后的位置进行检查
|
||||
# 从指定的开始页开始遍历
|
||||
for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
|
||||
text = page.extract_text()
|
||||
if text:
|
||||
@ -505,6 +516,7 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
|
||||
if priority_match and start_page is None:
|
||||
start_page = i
|
||||
# print(f"匹配到优先模式,设置起始页为: {start_page}")
|
||||
continue
|
||||
else:
|
||||
# 如果未匹配优先模式,则按照一般模式进行判断
|
||||
if (
|
||||
@ -514,15 +526,26 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
|
||||
):
|
||||
if begin_pattern.search(cleaned_text):
|
||||
start_page = i
|
||||
print(f"匹配到附录等模式,设置起始页为: {start_page}")
|
||||
continue
|
||||
# print(f"匹配到附录等模式,设置起始页为: {start_page}")
|
||||
|
||||
# 确定结束页
|
||||
if start_page is not None and end_pattern.search(cleaned_text):
|
||||
# 确定结束页 - 附录、附件、附表等
|
||||
if start_page is not None and end_pattern_attachment.search(cleaned_text):
|
||||
# 额外检查当前页面是否不包含任何 include_keywords
|
||||
if not any(keyword in cleaned_text for keyword in include_keywords):
|
||||
if i > start_page:
|
||||
end_page = i
|
||||
# print(f"找到结束页 (附件类): {end_page}")
|
||||
break # 找到结束页后退出循环
|
||||
else:
|
||||
print(f"当前页面匹配附件结束模式但包含 include_keywords,继续查找。页面: {i}")
|
||||
|
||||
# 确定结束页 - 章节标题
|
||||
elif start_page is not None and end_pattern_chapter.search(cleaned_text):
|
||||
if i > start_page:
|
||||
end_page = i
|
||||
# print(f"找到结束页: {end_page}")
|
||||
print(f"找到结束页 (章节标题): {end_page}")
|
||||
break # 找到结束页后退出循环
|
||||
|
||||
return start_page, end_page
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user