11.8 截取资格文件bug处理

2024-11-08 17:09:44 +08:00 · 2024-11-08 17:09:44 +08:00 · 5c1cc1dd4b
commit 5c1cc1dd4b
parent 1e4949e83b
1 changed file with 33 additions and 10 deletions
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@@ -475,26 +475,37 @@ import re


 def extract_pages_qualification(pdf_document, begin_page, common_header):
+    # 开始匹配模式，仅匹配“附录”、“附件”或“附表”
    begin_pattern = re.compile(
        r'^(?:附录(?:一|1)?[：:]?|附件(?:一|1)?[：:]?|附表(?:一|1)?[：:]?)',
        re.MULTILINE
    )
+
+    # 优先匹配模式，匹配以“资格性检查”、“资格审查”或“符合性审查”开头
    priority_pattern = re.compile(
        r'^(资格性检查|资格审查|符合性审查)',
        re.MULTILINE
    )
-    end_pattern = re.compile(
-        r'^(?!.*(?:资格|符合))(?:附录.*?[：:]|附件.*?[：:]|附表.*?[：:]|附件\s*\d+).*$|'    #可能出现附件与符合性审查不在同一行的情况，误判结束位置。
-        r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$',
+
+    # 结束匹配模式 - 附录、附件、附表等
+    end_pattern_attachment = re.compile(
+        r'^(?:附录.*?[：:]|附件.*?[：:]|附表.*?[：:]|附件\s*\d+).*$',
        re.MULTILINE
    )
+
+    # 结束匹配模式 - 章节标题
+    end_pattern_chapter = re.compile(
+        r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$',
+        re.MULTILINE
+    )
+
    print("第二次尝试:匹配附件")
    start_page = None
    end_page = None
-    include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查"]
+    include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查","资格检查"]
    exclude_keywords = ["声明函", "承诺函"]

-    # 从章节开始后的位置进行检查
+    # 从指定的开始页开始遍历
    for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
        text = page.extract_text()
        if text:
@@ -505,6 +516,7 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
            if priority_match and start_page is None:
                start_page = i
                # print(f"匹配到优先模式，设置起始页为: {start_page}")
+                continue
            else:
                # 如果未匹配优先模式，则按照一般模式进行判断
                if (
@@ -514,15 +526,26 @@ def extract_pages_qualification(pdf_document, begin_page, common_header):
                ):
                    if begin_pattern.search(cleaned_text):
                        start_page = i
-                        print(f"匹配到附录等模式，设置起始页为: {start_page}")
+                        continue
+                        # print(f"匹配到附录等模式，设置起始页为: {start_page}")

-            # 确定结束页
-            if start_page is not None and end_pattern.search(cleaned_text):
+            # 确定结束页 - 附录、附件、附表等
+            if start_page is not None and end_pattern_attachment.search(cleaned_text):
+                # 额外检查当前页面是否不包含任何 include_keywords
+                if not any(keyword in cleaned_text for keyword in include_keywords):
                    if i > start_page:
                        end_page = i
-                    # print(f"找到结束页: {end_page}")
+                        # print(f"找到结束页 (附件类): {end_page}")
                        break  # 找到结束页后退出循环
+                else:
+                    print(f"当前页面匹配附件结束模式但包含 include_keywords，继续查找。页面: {i}")

+            # 确定结束页 - 章节标题
+            elif start_page is not None and end_pattern_chapter.search(cleaned_text):
+                if i > start_page:
+                    end_page = i
+                    print(f"找到结束页 (章节标题): {end_page}")
+                    break  # 找到结束页后退出循环
    return start_page, end_page