11.8 截取资格文件bug处理
This commit is contained in:
parent
1e4949e83b
commit
5c1cc1dd4b
@ -475,26 +475,37 @@ import re
|
|||||||
|
|
||||||
|
|
||||||
def extract_pages_qualification(pdf_document, begin_page, common_header):
|
def extract_pages_qualification(pdf_document, begin_page, common_header):
|
||||||
|
# 开始匹配模式,仅匹配“附录”、“附件”或“附表”
|
||||||
begin_pattern = re.compile(
|
begin_pattern = re.compile(
|
||||||
r'^(?:附录(?:一|1)?[::]?|附件(?:一|1)?[::]?|附表(?:一|1)?[::]?)',
|
r'^(?:附录(?:一|1)?[::]?|附件(?:一|1)?[::]?|附表(?:一|1)?[::]?)',
|
||||||
re.MULTILINE
|
re.MULTILINE
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# 优先匹配模式,匹配以“资格性检查”、“资格审查”或“符合性审查”开头
|
||||||
priority_pattern = re.compile(
|
priority_pattern = re.compile(
|
||||||
r'^(资格性检查|资格审查|符合性审查)',
|
r'^(资格性检查|资格审查|符合性审查)',
|
||||||
re.MULTILINE
|
re.MULTILINE
|
||||||
)
|
)
|
||||||
end_pattern = re.compile(
|
|
||||||
r'^(?!.*(?:资格|符合))(?:附录.*?[::]|附件.*?[::]|附表.*?[::]|附件\s*\d+).*$|' #可能出现附件与符合性审查不在同一行的情况,误判结束位置。
|
# 结束匹配模式 - 附录、附件、附表等
|
||||||
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',
|
end_pattern_attachment = re.compile(
|
||||||
|
r'^(?:附录.*?[::]|附件.*?[::]|附表.*?[::]|附件\s*\d+).*$',
|
||||||
re.MULTILINE
|
re.MULTILINE
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# 结束匹配模式 - 章节标题
|
||||||
|
end_pattern_chapter = re.compile(
|
||||||
|
r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',
|
||||||
|
re.MULTILINE
|
||||||
|
)
|
||||||
|
|
||||||
print("第二次尝试:匹配附件")
|
print("第二次尝试:匹配附件")
|
||||||
start_page = None
|
start_page = None
|
||||||
end_page = None
|
end_page = None
|
||||||
include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查"]
|
include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查","资格检查"]
|
||||||
exclude_keywords = ["声明函", "承诺函"]
|
exclude_keywords = ["声明函", "承诺函"]
|
||||||
|
|
||||||
# 从章节开始后的位置进行检查
|
# 从指定的开始页开始遍历
|
||||||
for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
|
for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
|
||||||
text = page.extract_text()
|
text = page.extract_text()
|
||||||
if text:
|
if text:
|
||||||
@ -505,6 +516,7 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
|
|||||||
if priority_match and start_page is None:
|
if priority_match and start_page is None:
|
||||||
start_page = i
|
start_page = i
|
||||||
# print(f"匹配到优先模式,设置起始页为: {start_page}")
|
# print(f"匹配到优先模式,设置起始页为: {start_page}")
|
||||||
|
continue
|
||||||
else:
|
else:
|
||||||
# 如果未匹配优先模式,则按照一般模式进行判断
|
# 如果未匹配优先模式,则按照一般模式进行判断
|
||||||
if (
|
if (
|
||||||
@ -514,15 +526,26 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
|
|||||||
):
|
):
|
||||||
if begin_pattern.search(cleaned_text):
|
if begin_pattern.search(cleaned_text):
|
||||||
start_page = i
|
start_page = i
|
||||||
print(f"匹配到附录等模式,设置起始页为: {start_page}")
|
continue
|
||||||
|
# print(f"匹配到附录等模式,设置起始页为: {start_page}")
|
||||||
|
|
||||||
# 确定结束页
|
# 确定结束页 - 附录、附件、附表等
|
||||||
if start_page is not None and end_pattern.search(cleaned_text):
|
if start_page is not None and end_pattern_attachment.search(cleaned_text):
|
||||||
|
# 额外检查当前页面是否不包含任何 include_keywords
|
||||||
|
if not any(keyword in cleaned_text for keyword in include_keywords):
|
||||||
|
if i > start_page:
|
||||||
|
end_page = i
|
||||||
|
# print(f"找到结束页 (附件类): {end_page}")
|
||||||
|
break # 找到结束页后退出循环
|
||||||
|
else:
|
||||||
|
print(f"当前页面匹配附件结束模式但包含 include_keywords,继续查找。页面: {i}")
|
||||||
|
|
||||||
|
# 确定结束页 - 章节标题
|
||||||
|
elif start_page is not None and end_pattern_chapter.search(cleaned_text):
|
||||||
if i > start_page:
|
if i > start_page:
|
||||||
end_page = i
|
end_page = i
|
||||||
# print(f"找到结束页: {end_page}")
|
print(f"找到结束页 (章节标题): {end_page}")
|
||||||
break # 找到结束页后退出循环
|
break # 找到结束页后退出循环
|
||||||
|
|
||||||
return start_page, end_page
|
return start_page, end_page
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user