diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py index fec4c47..c872f0b 100644 --- a/flask_app/货物标/截取pdf货物标版.py +++ b/flask_app/货物标/截取pdf货物标版.py @@ -475,26 +475,37 @@ import re def extract_pages_qualification(pdf_document, begin_page, common_header): + # 开始匹配模式,仅匹配“附录”、“附件”或“附表” begin_pattern = re.compile( r'^(?:附录(?:一|1)?[::]?|附件(?:一|1)?[::]?|附表(?:一|1)?[::]?)', re.MULTILINE ) + + # 优先匹配模式,匹配以“资格性检查”、“资格审查”或“符合性审查”开头 priority_pattern = re.compile( r'^(资格性检查|资格审查|符合性审查)', re.MULTILINE ) - end_pattern = re.compile( - r'^(?!.*(?:资格|符合))(?:附录.*?[::]|附件.*?[::]|附表.*?[::]|附件\s*\d+).*$|' #可能出现附件与符合性审查不在同一行的情况,误判结束位置。 - r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', + + # 结束匹配模式 - 附录、附件、附表等 + end_pattern_attachment = re.compile( + r'^(?:附录.*?[::]|附件.*?[::]|附表.*?[::]|附件\s*\d+).*$', re.MULTILINE ) + + # 结束匹配模式 - 章节标题 + end_pattern_chapter = re.compile( + r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$', + re.MULTILINE + ) + print("第二次尝试:匹配附件") start_page = None end_page = None - include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查"] + include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查","资格检查"] exclude_keywords = ["声明函", "承诺函"] - # 从章节开始后的位置进行检查 + # 从指定的开始页开始遍历 for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page): text = page.extract_text() if text: @@ -505,6 +516,7 @@ def extract_pages_qualification(pdf_document, begin_page, common_header): if priority_match and start_page is None: start_page = i # print(f"匹配到优先模式,设置起始页为: {start_page}") + continue else: # 如果未匹配优先模式,则按照一般模式进行判断 if ( @@ -514,15 +526,26 @@ def extract_pages_qualification(pdf_document, begin_page, common_header): ): if begin_pattern.search(cleaned_text): start_page = i - print(f"匹配到附录等模式,设置起始页为: {start_page}") + continue + # print(f"匹配到附录等模式,设置起始页为: {start_page}") - # 确定结束页 - if start_page is not None and end_pattern.search(cleaned_text): + # 确定结束页 - 附录、附件、附表等 + if start_page is not None and end_pattern_attachment.search(cleaned_text): + # 额外检查当前页面是否不包含任何 include_keywords + if not any(keyword in cleaned_text for keyword in include_keywords): + if i > start_page: + end_page = i + # print(f"找到结束页 (附件类): {end_page}") + break # 找到结束页后退出循环 + else: + print(f"当前页面匹配附件结束模式但包含 include_keywords,继续查找。页面: {i}") + + # 确定结束页 - 章节标题 + elif start_page is not None and end_pattern_chapter.search(cleaned_text): if i > start_page: end_page = i - # print(f"找到结束页: {end_page}") + print(f"找到结束页 (章节标题): {end_page}") break # 找到结束页后退出循环 - return start_page, end_page