截取pdf工程/货物尝试合并

2025-01-09 16:45:55 +08:00 · 2025-01-09 16:45:55 +08:00 · 6c5163b2fb
commit 6c5163b2fb
parent d3425adcfb
3 changed files with 252 additions and 275 deletions
--- a/flask_app/general/截取pdf通用函数.py
+++ b/flask_app/general/截取pdf通用函数.py
@ -259,6 +259,246 @@ def get_invalid_file(file_path, output_folder, common_header,begin_page):
        print(f"处理文件 {file_path} 时发生错误: {e}")
        return ""

+def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
+                          output_suffix="normal"):
+    start_page = None
+    end_page = None
+    flag=True  # exclusion_pattern is applied at most once; cleared after its first hit
+    for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):  # i is the absolute page index
+        text = page.extract_text() or ""
+        cleaned_text = clean_page_content(text, common_header)
+        if output_suffix == "tobidders_notice":
+            catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
+            if exclusion_pattern and flag and (start_page is not None) and regex.search(exclusion_pattern, cleaned_text):
+                flag=False
+                continue
+            if begin_page==0 and catalog_pattern.search(cleaned_text):
+                continue  # Skip table-of-contents pages (only checked when scanning starts at page 0)
+        else:
+            # The "preparation/composition of bid documents" text usually sits inside the bidder-notice front table/body; to keep begin_pattern from matching there, exclusion_pattern is applied while start_page is None
+            if exclusion_pattern and flag and (start_page is None) and regex.search(exclusion_pattern, cleaned_text):
+                flag=False
+                continue
+        if start_page is None and regex.search(begin_pattern, cleaned_text):
+            if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page):  # "notice" may start on begin_page itself; other kinds must start after it
+                start_page = i
+                continue
+        if start_page is not None:
+            if output_suffix == "tobidders_notice":
+                if regex.search(end_pattern, cleaned_text) and i > start_page:
+                    end_page = i
+                    break
+            else:
+                if regex.search(end_pattern, cleaned_text) and (i > start_page) and not regex.search(begin_pattern, cleaned_text):
+                    end_page = i
+                    break
+    return start_page, end_page  # either value is None when the corresponding page was not found
+
+def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern):
+    """
+    Locate the start, middle and end pages of the bidder-notice section of a PDF.
+
+    If the first pass fails, extract_pages_twice_tobidders_notice is tried, then a third pass with looser patterns.
+
+    Args:
+        pdf_path: Path of the PDF; it is opened with PdfReader for each pass.
+        begin_pattern (str or regex.Pattern): Pattern that identifies the start page.
+        begin_page (int): A start page is only accepted at an index greater than this.
+        common_header (str): Header text passed to clean_page_content for every page.
+        exclusion_pattern (str or regex.Pattern): Pages matching it are skipped once the middle page is known.
+
+    Returns:
+        tuple: (start_page, mid_page, end_page); after the final pass any element may still be None.
+    """
+
+    def run_extraction(local_begin_pattern, local_end_pattern=None):
+        """
+        Run one extraction pass with the given begin and end patterns.
+
+        When local_end_pattern is not given, the end pattern is built from the heading kind matched on the start page.
+
+        Args:
+            local_begin_pattern (str or regex.Pattern): Pattern that identifies the start page.
+            local_end_pattern (str or regex.Pattern, optional): Pattern that identifies the end page.
+
+        Returns:
+            tuple: (start_page, mid_page, end_page)
+        """
+        start_page = None
+        mid_page = None
+        end_page = None
+        combined_mid_pattern = None  # combined pattern for the middle page
+        pdf_document = PdfReader(pdf_path)
+        catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
+        for i, page in enumerate(pdf_document.pages):
+            text = page.extract_text() or ""
+            cleaned_text = clean_page_content(text, common_header)
+
+            # Once the middle page has been found, skip pages that match the exclusion pattern
+            if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None:
+                continue
+            if begin_page==0 and catalog_pattern.search(cleaned_text):
+                continue  # Skip table-of-contents pages (only checked when begin_page is 0)
+
+            # Identify the start page
+            if start_page is None:
+                match = regex.search(local_begin_pattern, cleaned_text)
+                if match and i > begin_page:
+                    start_page = i
+                    matched_text:str = match.group(0)  # the full matched text
+                    # No fixed end_pattern was supplied: build one from the matched heading kind
+                    if not local_end_pattern:
+                        if '章' in matched_text:
+                            chapter_type = '章'
+                        elif '部分' in matched_text:
+                            chapter_type = '部分'
+                        else:
+                            chapter_type = None  # neither "章" nor "部分" was matched
+
+                        if chapter_type:
+                            # Build end_pattern from chapter_type
+                            end_pattern_dynamic = regex.compile(
+                                rf'^(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$',
+                                regex.MULTILINE
+                            )
+
+                            # Build additional_mid_pattern from chapter_type: a heading of the other kind also marks the middle page
+                            if chapter_type == '章':
+                                additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:部分)'
+                            elif chapter_type == '部分':
+                                additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:章)'
+                            else:
+                                additional_mid_pattern = ''  # not reached: chapter_type is '章' or '部分' in this branch
+
+                            # Base mid_pattern
+                            base_mid_pattern = r'^\s*(?:[（(]\s*[一二12]?\s*[)）]\s*[、．.]*|' \
+                                               r'[一二12][、．.]+|[、．.]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \
+                                               r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人)\s*须知正文\s*$'
+
+                            # Combine the base pattern with the additional pattern
+                            if additional_mid_pattern:
+                                combined_mid_pattern = regex.compile(
+                                    rf'(?:{base_mid_pattern})|(?:{additional_mid_pattern})',
+                                    regex.MULTILINE
+                                )
+                            else:
+                                combined_mid_pattern = regex.compile(
+                                    rf'{base_mid_pattern}',
+                                    regex.MULTILINE
+                                )
+                        else:
+                            # Neither "章" nor "部分" was matched: use the default end_pattern and mid_pattern
+                            end_pattern_dynamic = regex.compile(
+                                r'^(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$',
+                                regex.MULTILINE
+                            )
+
+                            # Base mid_pattern
+                            base_mid_pattern = r'^\s*(?:[（(]\s*[一二12]?\s*[)）]\s*[、．.]*|' \
+                                               r'[一二12][、．.]+|[、．.]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'
+                            combined_mid_pattern = regex.compile(
+                                rf'{base_mid_pattern}',
+                                regex.MULTILINE
+                            )
+                    else:
+                        # A fixed end_pattern was supplied: use the default mid_pattern
+                        base_mid_pattern = r'.*[（(]?\s*[一二12]?[)）]?\s*[、．.]*\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)\s*$'  # can match e.g. "东莞市粤隆招标有限公司编制 46一、说明"
+                        combined_mid_pattern = regex.compile(
+                            rf'{base_mid_pattern}',
+                            regex.MULTILINE
+                        )
+                    continue
+
+            # Identify the middle page
+            if start_page is not None and mid_page is None and combined_mid_pattern:
+                # if (start_page + 1 == i) and regex.search(local_begin_pattern, cleaned_text):
+                #     continue
+                if regex.search(combined_mid_pattern, cleaned_text):
+                    mid_page = i
+
+            # Identify the end page
+            if start_page is not None and mid_page is not None:
+                # Use the supplied end_pattern, otherwise the dynamically built one
+                current_end_pattern = local_end_pattern if local_end_pattern else end_pattern_dynamic
+                if regex.search(current_end_pattern, cleaned_text):
+                    if i > mid_page:
+                        end_page = i
+                        break
+
+        return start_page, mid_page, end_page
+
+    # First attempt, using the caller's begin_pattern
+    start_page, mid_page, end_page = run_extraction(begin_pattern)
+
+    # If the first attempt failed, fall back to extract_pages_twice_tobidders_notice
+    if not (start_page and mid_page and end_page):
+        print(f"第二次尝试 tobidders_notice!")
+        pdf_document = PdfReader(pdf_path)
+        start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
+        if start_page and end_page and mid_page:
+            return start_page, mid_page, end_page
+        else:
+            # Define a new begin_pattern and end_pattern
+            new_begin_pattern = regex.compile(
+                r'.*(?:投标人|磋商|谈判|供应商|应答人)须知\s*$|'
+                r'(?:一\s*、\s*)?(?:投标人?|磋商|谈判|供应商|应答人)须知前附表',
+                regex.MULTILINE
+            )
+            new_end_pattern = regex.compile(
+                r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$',
+                regex.MULTILINE
+            )
+            print("第三次尝试 tobidders_notice! ")
+            # Third attempt: rerun run_extraction with the new patterns
+            start_page, mid_page, end_page = run_extraction(new_begin_pattern, new_end_pattern)
+
+    return start_page, mid_page, end_page
+
+
+def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page):
+    output_suffix = "tobidders_notice"
+    begin_pattern = regex.compile(
+        r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:(?:投标人?|磋商|谈判|供应商|应答人)须知)+',
+        regex.MULTILINE
+    )
+    end_pattern = regex.compile(
+        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', regex.MULTILINE  # group 1 captures the Chinese title text
+    )
+    exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"]  # add further keywords to exclude here
+
+    exclusion_pattern = regex.compile(r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制')
+
+
+    # First part: from the bidder-notice chapter heading to the next chapter heading
+    start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header,exclusion_pattern,output_suffix)
+    if start_page1 is None or end_page1 is None:
+        return "", "", ""  # failure is reported as empty strings, not None
+
+    # Second part starts on the page where the first part ended
+    start_page2 = end_page1
+
+    # Inspect the text of page end_page1
+    text = pdf_document.pages[end_page1].extract_text() or ""
+    cleaned_text = clean_page_content(text, common_header)
+    match = end_pattern.search(cleaned_text)
+
+    if match:
+        # The Chinese title text captured by group 1
+        chapter_title = match.group(1)
+        # Check whether the title contains an excluded keyword
+        if any(word in chapter_title for word in exclusion_words):
+            # It does: there is no second part, so the middle and end page are the same
+            return start_page1, end_page1, end_page1
+
+    # No excluded keyword: go on to locate the end of the second part
+    _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
+                                         exclusion_pattern, output_suffix)  # start_page2 - 1 because a start page must lie after begin_page
+
+    if end_page2 is None:
+        return start_page1, end_page1, ""  # second part has no end page: reported as an empty string
+
+    return start_page1, end_page1, end_page2
+
 if __name__ == "__main__":
    file_path=r'D:\flask_project\flask_app\static\output\output1\f91db70d-8d96-44a5-b840-27d2f1ecbe95\ztbfile.pdf'
    output_folder=r'D:\flask_project\flask_app\static\output\output1\f91db70d-8d96-44a5-b840-27d2f1ecbe95\tmp'
--- a/flask_app/工程标/截取pdf工程标版.py
+++ b/flask_app/工程标/截取pdf工程标版.py
@ -139,25 +139,18 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
    for i in range(len(pdf_document.pages)):
        page = pdf_document.pages[i]
        text = page.extract_text()
-        if text:
-            cleaned_text = clean_page_content(text, common_header)
-            if flag and regex.search(exclusion_pattern, cleaned_text):
-                flag = False
+        cleaned_text = clean_page_content(text, common_header)
+        if flag and regex.search(exclusion_pattern, cleaned_text):
+            flag = False
+            continue
+        if start_page is None and regex.search(begin_pattern, cleaned_text):
+            if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page):
+                start_page = i
                continue
-            if regex.search(begin_pattern, cleaned_text) and i >= begin_page:
-                if start_page and (output_suffix == "notice" or output_suffix == "invalid"):
-                    pass
-                else:
-                    start_page = i
-            if start_page is not None and regex.search(end_pattern, cleaned_text) and not regex.search(begin_pattern,
-                                                                                                       cleaned_text):
-                condition = i > start_page
-                if condition:
-                    is_invalid_condition = output_suffix == "invalid" and i > 30  # 这边默认无效投标至少有30页
-                    if is_invalid_condition or output_suffix != "invalid":
-                        end_page = i
-                        break  # 找到结束页后退出循环
-
+        if start_page is not None:
+            if regex.search(end_pattern, cleaned_text) and (i > start_page) and not regex.search(begin_pattern, cleaned_text):
+                end_page = i
+                break
    # 移除对 extract_pages_twice 的调用
    if start_page is None or end_page is None:
        print(f"{output_suffix} first: 未找到起始或结束页在文件 {pdf_path} 中！")
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@ -4,51 +4,14 @@ import os  # 用于文件和文件夹操作
 from flask_app.general.clean_pdf import clean_page_content, extract_common_header
 from flask_app.general.merge_pdfs import merge_and_cleanup
 from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, is_pdf_or_doc, \
-    convert_to_pdf, get_invalid_file
+    convert_to_pdf, get_invalid_file, extract_pages_tobidders_notice, extract_pages_generic
 from flask_app.general.通用功能函数 import get_global_logger

-def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
-                          output_suffix="normal"):
-    start_page = None
-    end_page = None
-    flag=True
-    for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
-        text = page.extract_text() or ""
-        cleaned_text = clean_page_content(text, common_header)
-        if output_suffix == "tobidders_notice":
-            catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
-            if exclusion_pattern and flag and (start_page is not None) and regex.search(exclusion_pattern, cleaned_text):
-                flag=False
-                continue
-            if begin_page==0 and catalog_pattern.search(cleaned_text):
-                continue  # 如果存在目录，跳过当前页面
-        else:
-            #一般投标文件的编制、组成在'投标人须知前附表/正文'中，需要防止begin_pattern匹配到这部分内容，所以在start_page is None的时候使用该exclusion_pattern
-            if exclusion_pattern and flag and (start_page is None) and regex.search(exclusion_pattern, cleaned_text):
-                flag=False
-                continue
-        if start_page is None and regex.search(begin_pattern, cleaned_text):
-            if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page):
-                start_page = i
-                continue
-        if start_page is not None:
-            if output_suffix == "tobidders_notice":
-                if regex.search(end_pattern, cleaned_text) and i > start_page:
-                    end_page = i
-                    break
-            else:
-                if regex.search(end_pattern, cleaned_text) and (i > start_page) and not regex.search(begin_pattern, cleaned_text):
-                    end_page = i
-                    break
-    return start_page, end_page
-
-
 def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix,logger):
    try:
        common_header = extract_common_header(pdf_path)
        pdf_document = PdfReader(pdf_path)
        exclusion_pattern = None
-        total_pages = len(pdf_document.pages) - 1  # 获取总页数

        if output_suffix == "tobidders_notice":
            exclusion_pattern = regex.compile(
@ -59,9 +22,6 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
            if not start_page or not mid_page or not end_page:
                print(f"三次提取tobidders_notice均失败!!{pdf_path}")
                return "", ""
-            # if start_page is None or end_page is None or mid_page is None:
-            #     print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中！尝试备用提取策略。")
-            #     return extract_pages_twice_tobidders_notice(pdf_path, output_folder, output_suffix, common_header,begin_page)
            path1 = save_extracted_pages(pdf_path,output_folder,start_page, mid_page, "tobidders_notice_part1",common_header)
            if mid_page != end_page:
                path2 = save_extracted_pages(pdf_path,output_folder, mid_page, end_page,"tobidders_notice_part2",common_header)
@ -76,16 +36,6 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
                    r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制')
            start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page,
                                                         common_header, exclusion_pattern, output_suffix)
-            # 针对 selection = 6 的特殊处理
-            if output_suffix == "format":
-                if start_page is None:
-                    print(f"{output_suffix}: 未找到起始页，提取失败！")
-                    return ""
-                if end_page is None:
-                    # 如果未匹配到结束页，默认截取到文件末尾
-                    end_page = total_pages
-                    print(f"{output_suffix}: 未找到结束页，默认截取到文件末尾。")
-                return save_extracted_pages(pdf_path, output_folder,start_page, end_page, output_suffix,common_header)
            if start_page is None or end_page is None:
                print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中！尝试备用提取策略。")
                return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page,logger)
@ -136,212 +86,6 @@ def get_patterns_for_notice():
    )
    return begin_pattern, end_pattern

-def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern):
-    """
-    从PDF文档中提取起始页、中间页和结束页。
-
-    如果第一次提取失败，则使用新的 begin_pattern 和 end_pattern 重新提取。
-
-    参数:
-        pdf_document (PDFDocument): 要处理的PDF文档对象。
-        begin_pattern (str 或 regex.Pattern): 用于识别起始的正则表达式模式。
-        begin_page (int): 开始搜索的页码。
-        common_header (str): 每页需要清理的公共头部文本。
-        exclusion_pattern (str 或 regex.Pattern): 用于排除某些页的模式。
-
-    返回:
-        tuple: (start_page, mid_page, end_page) 如果成功，否则 (None, None, None)
-    """
-
-    def run_extraction(local_begin_pattern, local_end_pattern=None):
-        """
-        使用提供的 begin 和 end 模式运行提取过程。
-
-        如果未提供 local_end_pattern，则根据匹配的章节类型动态生成 end_pattern。
-
-        参数:
-            local_begin_pattern (str 或 regex.Pattern): 用于识别起始的正则表达式模式。
-            local_end_pattern (str 或 regex.Pattern, 可选): 用于识别结束的正则表达式模式。
-
-        返回:
-            tuple: (start_page, mid_page, end_page)
-        """
-        start_page = None
-        mid_page = None
-        end_page = None
-        combined_mid_pattern = None  # 中间页的组合模式
-        pdf_document = PdfReader(pdf_path)
-        catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
-        for i, page in enumerate(pdf_document.pages):
-            text = page.extract_text() or ""
-            cleaned_text = clean_page_content(text, common_header)
-
-            # 如果已经找到中间页，且当前页匹配排除模式，则跳过
-            if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None:
-                continue
-            if begin_page==0 and catalog_pattern.search(cleaned_text):
-                continue  # 如果存在目录，跳过当前页面
-
-            # 识别起始页
-            if start_page is None:
-                match = regex.search(local_begin_pattern, cleaned_text)
-                if match and i > begin_page:
-                    start_page = i
-                    matched_text:str = match.group(0)  # 获取整个匹配的文本
-                    # 如果未提供固定的 end_pattern，则根据匹配的章节类型动态生成
-                    if not local_end_pattern:
-                        if '章' in matched_text:
-                            chapter_type = '章'
-                        elif '部分' in matched_text:
-                            chapter_type = '部分'
-                        else:
-                            chapter_type = None  # 未匹配到“章”或“部分”
-
-                        if chapter_type:
-                            # 根据 chapter_type 动态生成 end_pattern
-                            end_pattern_dynamic = regex.compile(
-                                rf'^(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$',
-                                regex.MULTILINE
-                            )
-
-                            # 根据 chapter_type 动态生成 additional_mid_pattern
-                            if chapter_type == '章':
-                                additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:部分)'
-                            elif chapter_type == '部分':
-                                additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:章)'
-                            else:
-                                additional_mid_pattern = ''
-
-                            # 定义基础的 mid_pattern
-                            base_mid_pattern = r'^\s*(?:[（(]\s*[一二12]?\s*[)）]\s*[、．.]*|' \
-                                               r'[一二12][、．.]+|[、．.]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \
-                                               r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人)\s*须知正文\s*$'
-
-                            # 合并基础模式和额外模式
-                            if additional_mid_pattern:
-                                combined_mid_pattern = regex.compile(
-                                    rf'(?:{base_mid_pattern})|(?:{additional_mid_pattern})',
-                                    regex.MULTILINE
-                                )
-                            else:
-                                combined_mid_pattern = regex.compile(
-                                    rf'{base_mid_pattern}',
-                                    regex.MULTILINE
-                                )
-                        else:
-                            # 如果未匹配到“章”或“部分”，使用默认的 end_pattern 和 mid_pattern
-                            end_pattern_dynamic = regex.compile(
-                                r'^(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$',
-                                regex.MULTILINE
-                            )
-
-                            # 定义基础的 mid_pattern
-                            base_mid_pattern = r'^\s*(?:[（(]\s*[一二12]?\s*[)）]\s*[、．.]*|' \
-                                               r'[一二12][、．.]+|[、．.]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'
-                            combined_mid_pattern = regex.compile(
-                                rf'{base_mid_pattern}',
-                                regex.MULTILINE
-                            )
-                    else:
-                        # 如果提供了固定的 end_pattern，则使用默认的 mid_pattern
-                        base_mid_pattern = r'.*[（(]?\s*[一二12]?[)）]?\s*[、．.]*\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)\s*$'  # 可以匹配"东莞市粤隆招标有限公司编制 46一、说明"
-                        combined_mid_pattern = regex.compile(
-                            rf'{base_mid_pattern}',
-                            regex.MULTILINE
-                        )
-                    continue
-
-            # 识别中间页
-            if start_page is not None and mid_page is None and combined_mid_pattern:
-                # if (start_page + 1 == i) and regex.search(local_begin_pattern, cleaned_text):
-                #     continue
-                if regex.search(combined_mid_pattern, cleaned_text):
-                    mid_page = i
-
-            # 识别结束页
-            if start_page is not None and mid_page is not None:
-                # 使用提供的 end_pattern 或动态生成的 end_pattern
-                current_end_pattern = local_end_pattern if local_end_pattern else end_pattern_dynamic
-                if regex.search(current_end_pattern, cleaned_text):
-                    if i > mid_page:
-                        end_page = i
-                        break
-
-        return start_page, mid_page, end_page
-
-    # 第一次提取尝试，使用初始的 begin_pattern
-    start_page, mid_page, end_page = run_extraction(begin_pattern)
-
-    # 如果第一次提取失败，则使用新的 begin_pattern 和 end_pattern 重新提取
-    if not (start_page and mid_page and end_page):
-        print(f"第二次尝试 tobidders_notice!")
-        pdf_document = PdfReader(pdf_path)
-        start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)
-        if start_page and end_page and mid_page:
-            return start_page, mid_page, end_page
-        else:
-            # 定义新的 begin_pattern 和 end_pattern
-            new_begin_pattern = regex.compile(
-                r'.*(?:投标人|磋商|谈判|供应商|应答人)须知\s*$|'
-                r'(?:一\s*、\s*)?(?:投标人?|磋商|谈判|供应商|应答人)须知前附表',
-                regex.MULTILINE
-            )
-            new_end_pattern = regex.compile(
-                r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$',
-                regex.MULTILINE
-            )
-            print("第三次尝试 tobidders_notice! ")
-            # 第二次提取尝试，使用新的模式
-            start_page, mid_page, end_page = run_extraction(new_begin_pattern, new_end_pattern)
-
-    return start_page, mid_page, end_page
-
-
-def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page):
-    output_suffix = "tobidders_notice"
-    begin_pattern = regex.compile(
-        r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:(?:投标人?|磋商|谈判|供应商|应答人)须知)+',
-        regex.MULTILINE
-    )
-    end_pattern = regex.compile(
-        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', regex.MULTILINE  # 捕获中文部分
-    )
-    exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"]  # 在这里添加需要排除的关键词
-
-    exclusion_pattern = regex.compile(r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制')
-
-
-    # 提取第一部分
-    start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header,exclusion_pattern,output_suffix)
-    if start_page1 is None or end_page1 is None:
-        return "", "", ""
-
-    # 提取第二部分
-    start_page2 = end_page1
-
-    # 检查end_page1页面的内容
-    text = pdf_document.pages[end_page1].extract_text() or ""
-    cleaned_text = clean_page_content(text, common_header)
-    match = end_pattern.search(cleaned_text)
-
-    if match:
-        # 获取匹配到的中文部分
-        chapter_title = match.group(1)
-        # 检查是否包含排除关键词
-        if any(word in chapter_title for word in exclusion_words):
-            # 如果包含排除关键词，直接返回相同的路径
-            return start_page1, end_page1, end_page1
-
-    # 如果不包含排除关键词，继续提取第二部分
-    _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
-                                         exclusion_pattern, output_suffix)
-
-    if end_page2 is None:
-        return start_page1, end_page1, ""
-
-    return start_page1, end_page1, end_page2
-
-
 def extract_pages_qualification(pdf_document, begin_page, common_header):
    # 开始匹配模式，仅匹配“附录”、“附件”或“附表”
    begin_pattern = regex.compile(