zbparse/flask_app/general/截取pdf通用函数.py

769 lines
36 KiB
Python
Raw Normal View History

# flask_app/general/截取pdf通用函数.py
2024-12-17 18:44:58 +08:00
import concurrent.futures
import os
import fitz
2024-12-17 18:44:58 +08:00
import regex
2024-12-18 10:32:24 +08:00
from PyPDF2 import PdfReader, PdfWriter
2025-02-16 18:09:45 +08:00
from flask_app.general.读取文件.clean_pdf import extract_common_header, clean_page_content, create_get_text_function, \
get_text_pypdf2
2024-12-18 10:32:24 +08:00
from flask_app.general.format_change import docx2pdf
2024-12-18 16:01:32 +08:00
import concurrent.futures
2024-12-18 10:32:24 +08:00
def get_start_and_common_header(input_path, end_page):
    """
    Locate the common page header and the starting page of a tender PDF.

    Args:
        input_path: path to the PDF file.
        end_page: last page index (inclusive) worth scanning for the start page.

    Returns:
        tuple (common_header, last_begin_index):
        - common_header: text repeated at the top of pages ("" when header
          extraction fails).
        - last_begin_index: 0-based index of the first page whose content
          matches an announcement/invitation heading, or -1 when no page
          matches or the PDF cannot be read.
    """
    try:
        # Header shared by all pages; used to clean per-page text below.
        common_header = extract_common_header(input_path)
    except Exception as e:
        print(f"提取公共页眉时出错: {e}")
        return "", -1

    # Announcement/invitation headings at end of line; the look-behinds reject
    # cross references such as "见...公告" / "根据...公告".
    begin_pattern = regex.compile(
        r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)]?\s*$',
        regex.MULTILINE
    )
    catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)

    def process_pages(get_text, total_pages):
        # Only pages 0..end_page are relevant, so clamp the range up front
        # instead of testing `i > end_page` on every iteration.
        for i in range(min(total_pages, end_page + 1)):
            text = get_text(i)
            if text:
                cleaned_text = clean_page_content(text, common_header)
                # Skip table-of-contents pages.
                if catalog_pattern.search(cleaned_text):
                    continue
                if begin_pattern.search(cleaned_text):
                    return i  # first matching page (0-based)
        return -1  # no match within the allowed range

    # First attempt: PyPDF2.
    try:
        with open(input_path, "rb") as f:
            pdf_document = PdfReader(f)
            total_pages = len(pdf_document.pages)
            get_text = create_get_text_function('pypdf2', pdf_document)
            last_begin_index = process_pages(get_text, total_pages)
            return common_header, last_begin_index
    except Exception as e_pypdf2:
        print(f"get_start_and_common_header: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
        # Fallback: PyMuPDF (fitz).
        try:
            with fitz.open(input_path) as pdf_document:
                total_pages = pdf_document.page_count
                get_text = create_get_text_function('fitz', pdf_document)
                last_begin_index = process_pages(get_text, total_pages)
                return common_header, last_begin_index
        except Exception as e_fitz:
            print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
            return common_header, -1
2024-12-17 18:44:58 +08:00
def check_pdf_pages(pdf_path, mode, logger):
    """
    Decide whether a PDF is short enough (<= 30 pages) to skip the splitting
    pipeline.

    Returns:
        (True, placeholder_list) for short or unreadable PDFs — the list's
        length depends on *mode*; (False, []) when the PDF has more than 30
        pages and should be processed normally.
    """
    def generate_invalid_return():
        # 'goods' callers expect one extra placeholder slot before pdf_path.
        placeholders = (['', '', '', '', '', '', pdf_path, '']
                       if mode == 'goods'
                       else ['', '', '', '', '', pdf_path, ''])
        return True, placeholders

    def count_pages():
        # Prefer PyPDF2; fall back to PyMuPDF when it cannot parse the file.
        try:
            with open(pdf_path, "rb") as fh:
                return len(PdfReader(fh).pages)
        except Exception as e_pypdf2:
            logger.warning(f"check_pdf_pages: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
            with fitz.open(pdf_path) as doc:
                return doc.page_count

    try:
        if count_pages() <= 30:
            logger.info("PDFinvalid_path页数小于或等于30页跳过切分逻辑。")
            return generate_invalid_return()
        # More than 30 pages: caller should continue with the split logic.
        return False, []
    except Exception as e:
        logger.error(f"无法读取 PDF 页数: {e}")
        return generate_invalid_return()
2025-02-16 18:09:45 +08:00
def extract_and_save_pages(pdf_document, from_page, to_page, output_path, is_using_pypdf2):
    """
    Extract pages [from_page, to_page] (inclusive, 0-based) from pdf_document
    and save them to output_path.

    Args:
        pdf_document: an open PdfReader (PyPDF2) or fitz.Document (PyMuPDF).
        from_page / to_page: inclusive page range to copy.
        output_path: destination PDF path.
        is_using_pypdf2: True when pdf_document is a PyPDF2 reader, False for
            a PyMuPDF document.

    Returns:
        output_path on success, "" when the output cannot be written.
    """
    if is_using_pypdf2:
        # PyPDF2 path: copy pages into a writer, then write once.
        writer = PdfWriter()
        for page_num in range(from_page, to_page + 1):
            try:
                writer.add_page(pdf_document.pages[page_num])
            except Exception as e_page:
                # Skip unreadable pages but keep the rest of the range.
                print(f"添加第 {page_num} 页到 writer 时出错: {e_page}")
                continue

        try:
            with open(output_path, 'wb') as f_out:
                writer.write(f_out)
            # Fixed message: separator between page range and path was lost.
            print(f"已截取并保存页面 {from_page} - {to_page} 到 {output_path}")
        except Exception as e_write:
            print(f"保存时出错: {e_write}")
            return ""
        return output_path
    else:
        # PyMuPDF path: build a new document page by page.
        output_doc = fitz.open()
        try:
            for page_num in range(from_page, to_page + 1):
                try:
                    output_doc.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
                except Exception as e_page:
                    print(f"添加第 {page_num} 页到 output_doc 时出错: {e_page}")
                    continue
            output_doc.save(output_path)
            print(f"已截取并保存页面 {from_page} - {to_page} 到 {output_path}")
            return output_path
        except Exception as e_write:
            print(f"保存时出错: {e_write}")
            return ""
        finally:
            output_doc.close()
def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header):
    """
    Save pages [start_page, end_page] of pdf_path into output_folder as
    "<basename>_<output_suffix>.pdf".

    For output_suffix == 'notice' with start_page > 0, the pages before the
    section (up to and including a table-of-contents page, when one is found)
    are additionally saved as "<basename>_before.pdf".

    Returns:
        the saved file path, or "" on any failure / invalid range.
    """
    # Bail out early on an undetermined range.
    if start_page is None or end_page is None:
        print("错误: start_page 或 end_page 为 None")
        return ""

    def process_document(pdf_document, total_pages, using_pypdf2, get_text):
        # Validate the requested range against the actual document.
        if start_page < 0 or end_page >= total_pages or start_page > end_page:
            # Fixed message: separator between the two page numbers was lost.
            print(f"无效的页面范围: {start_page} 到 {end_page}")
            return ""
        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")

        # For the notice section, also save everything before it.
        if output_suffix == 'notice' and start_page > 0:
            before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
            toc_page = -1
            # Scan pages 0 .. start_page-1 for a table-of-contents marker.
            for page_num in range(min(start_page, total_pages)):
                try:
                    text = get_text(page_num)
                    cleaned_text = clean_page_content(text, common_header)
                    # Fixed pattern: '目' was lost, leaving bare '录' which also
                    # matched 附录/记录 etc.; restore the intended "目录" search.
                    if regex.search(r'目\s*录', cleaned_text, regex.MULTILINE):
                        toc_page = page_num
                        break
                except Exception as e_extract:
                    print(f"提取第 {page_num} 页文本时出错: {e_extract}")
                    continue

            # Include the TOC page when found, otherwise everything before start_page.
            pages_to_extract = toc_page + 1 if toc_page != -1 else start_page
            if pages_to_extract > 0:
                extract_and_save_pages(pdf_document, 0, pages_to_extract - 1, before_pdf_path, using_pypdf2)
        # Finally save the requested section itself.
        return extract_and_save_pages(pdf_document, start_page, end_page, output_pdf_path, using_pypdf2)

    # Stage 1: PyPDF2 (file handle closed by the with-block).
    try:
        with open(pdf_path, "rb") as pdf_file:
            pdf_document = PdfReader(pdf_file)
            total_pages = len(pdf_document.pages)
            get_text = create_get_text_function('pypdf2', pdf_document)
            using_pypdf2 = True
            # Must run inside the with-block while the stream is alive.
            return process_document(pdf_document, total_pages, using_pypdf2, get_text)
    except Exception as e_pypdf2:
        print(f"使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
        # Stage 2: PyMuPDF fallback.
        try:
            with fitz.open(pdf_path) as pdf_document:
                total_pages = pdf_document.page_count
                get_text = create_get_text_function('fitz', pdf_document)
                using_pypdf2 = False
                return process_document(pdf_document, total_pages, using_pypdf2, get_text)
        except Exception as e_fitz:
            print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
            return ""
2024-12-18 10:32:24 +08:00
def is_pdf_or_doc(filename):
    """Return True when *filename* has a PDF or Word extension (case-insensitive)."""
    document_extensions = ('.pdf', '.doc', '.docx')
    return filename.lower().endswith(document_extensions)
2025-02-14 15:46:07 +08:00
def get_invalid_file(pdf_path, output_folder, common_header, begin_page):
    """
    Extract the "invalid" (boilerplate) portion of a tender PDF and save it.

    Behavior notes (from the original implementation):
    - PDFs with <= 50 pages are returned unchanged.
    - For < 100 pages, the start page defaults to 0 (no begin-pattern search).
    - For > 200 pages, the end pattern is searched forward from page 31;
      otherwise backward from the last page (always keeping the first 30).

    Args:
        pdf_path (str): input PDF path.
        output_folder (str): folder for the extracted file.
        common_header (str): shared page header used to clean page text.
        begin_page: page index from which to start the begin-pattern search.

    Returns:
        tuple (output_path, end_page_found); ("", 0) on failure.
    """
    # Section-start headings (announcements / invitations).
    begin_patterns = [
        regex.compile(
            r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)]?\s*$',
            regex.MULTILINE
        ),
        regex.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请',
            regex.MULTILINE
        )
    ]
    # Section-end headings (bid-document format / contract chapters).
    end_patterns = [
        regex.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:(?:文件|书)(?:的)?(?:组成|构成|编制)|格式).*|'
            r'第[一二三四五六七八九十]+(?:章|部分).*?合同|[:]清标报告',
            regex.MULTILINE
        ),
        regex.compile(
            r"\s*(投标文件|响应文件|响应性文件|应答文件)(?:的)?格式(?:及相关附件)?\s*$",
            regex.MULTILINE
        ),
    ]
    # Lines that must not count as a start match.
    exclusion_pattern = regex.compile(
        r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
    # Hoisted out of the page loop: previously recompiled per page in
    # find_end_page. Extends exclusion_pattern with evaluation headings.
    end_exclusion_pattern = regex.compile(
        r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制|评审要素|评审标准)\s*$',
        regex.MULTILINE
    )

    pdf_document = None
    try:
        # Load all page texts up front (PyPDF2, then PyMuPDF fallback).
        try:
            with open(pdf_path, "rb") as f:
                pdf_document = PdfReader(f)
                get_text = create_get_text_function('pypdf2', pdf_document)
                total_pages = len(pdf_document.pages)
                if total_pages <= 50:
                    print(f"PDF页数({total_pages}) <= 50直接返回文件路径: {pdf_path}")
                    return pdf_path, 0
                page_texts = []
                for i in range(total_pages):
                    text = get_text(i)
                    cleaned_text = clean_page_content(text, common_header) if text else ""
                    page_texts.append(cleaned_text)
        except Exception as e:
            with fitz.open(pdf_path) as pdf_document:
                total_pages = pdf_document.page_count
                get_text = create_get_text_function('fitz', pdf_document)
                if total_pages <= 50:
                    print(f"PDF页数({total_pages}) <= 50直接返回文件路径: {pdf_path}")
                    return pdf_path, 0
                page_texts = []
                for i in range(total_pages):
                    text = get_text(i)
                    cleaned_text = clean_page_content(text, common_header) if text else ""
                    page_texts.append(cleaned_text)

        def find_start_page(begin_page):
            # Search at most 30 pages from begin_page for a start heading.
            for pattern in begin_patterns:
                for i in range(begin_page, min(begin_page + 30, total_pages)):
                    text = page_texts[i]
                    if regex.search(exclusion_pattern, text):
                        continue
                    if regex.search(pattern, text):
                        return i
            return 0  # default: start at the first page

        def find_end_page():
            if total_pages > 200:
                # Many pages: search forward from page 31.
                for pattern in end_patterns:
                    for i in range(30, total_pages):
                        text = page_texts[i]
                        if regex.search(end_exclusion_pattern, text):
                            continue
                        if regex.search(pattern, text):
                            return i
            else:
                # Fewer pages: search backward, keeping at least the first 30.
                for pattern in end_patterns:
                    for i in range(total_pages - 1, 29, -1):
                        text = page_texts[i]
                        if regex.search(pattern, text):
                            return i

            # No end pattern matched: pick a heuristic default end page.
            if total_pages > 200:
                print(f"未找到结束模式总页数大于200平滑调整结束页为 {max(100, int(total_pages * 1 / 2))}{pdf_path}")
                return max(100, int(total_pages * 1 / 2))
            elif 100 < total_pages <= 200:
                print("未找到结束模式总页数在100到200之间设置结束页为总页数的三分之二。 " + pdf_path)
                return max(100, int(total_pages * 2 / 3))
            else:
                print("未找到结束模式,设置结束页为最后一页。 " + pdf_path)
                return total_pages - 1

        if total_pages < 100:
            # Only one search needed — the original spun up a thread pool for
            # a single task; a direct call is equivalent and cheaper.
            start_page_found = 0
            end_page_found = find_end_page()
        else:
            # Run the two independent searches concurrently.
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future_start = executor.submit(find_start_page, begin_page)
                future_end = executor.submit(find_end_page)
                start_page_found = future_start.result()
                end_page_found = future_end.result()

        if start_page_found > end_page_found:
            print(f"无效的页码范围: 起始页 ({start_page_found + 1}) > 结束页 ({end_page_found + 1})")
            return "", 0

        output_path = save_extracted_pages(
            pdf_path=pdf_path,
            output_folder=output_folder,
            start_page=start_page_found,
            end_page=end_page_found,
            output_suffix="invalid",
            common_header=common_header
        )
        return output_path, end_page_found
    except Exception as e:
        print(f"处理文件 {pdf_path} 时发生错误: {e}")
        return "", 0
    finally:
        # Guard against double-closing: the fitz document is already closed by
        # its with-block; closing a closed Document raises.
        if isinstance(pdf_document, fitz.Document) and not pdf_document.is_closed:
            pdf_document.close()
2024-12-18 16:01:32 +08:00
2025-02-16 18:09:45 +08:00
def extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, common_header,
                          exclusion_pattern=None, output_suffix="normal", invalid_endpage=-1):
    """
    Generic page-range locator: scan pages from begin_page and return the
    (start_page, end_page) pair delimited by begin_pattern / end_pattern.

    Args:
        pdf_path: PDF to scan.
        begin_pattern / end_pattern: compiled regexes for the section bounds.
        begin_page: index before which a start match is not accepted
            (for 'notice' the start may equal begin_page).
        common_header: shared page header stripped from each page's text.
        exclusion_pattern: optional regex; its first match on a page skips
            that page (only once — see `flag` below).
        output_suffix: section tag; changes matching rules for
            'notice' / 'tobidders_notice' / 'tobidders_notice2'.
        invalid_endpage: hard upper bound for the scan (-1 = no bound).

    Returns:
        (start_page, end_page) — either may be None; (None, None) when the
        PDF cannot be read at all.
    """
    # Hoisted: constant pattern, previously recompiled on every page when
    # output_suffix == "tobidders_notice2".
    catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)

    def process_pages(get_text, total_pages):
        start_page = None
        end_page = None
        flag = True  # exclusion is applied at most once, then disabled
        end_limit = total_pages if invalid_endpage == -1 else min(invalid_endpage, total_pages)
        for i in range(begin_page, end_limit):
            text = get_text(i)
            if text:
                cleaned_text = clean_page_content(text, common_header)
                if output_suffix == "tobidders_notice2":
                    # Here exclusion only applies after the start was found.
                    if exclusion_pattern and flag and (start_page is not None) and regex.search(exclusion_pattern, cleaned_text):
                        flag = False
                        continue
                    # Skip a table-of-contents page when scanning from page 0.
                    if begin_page == 0 and catalog_pattern.search(cleaned_text):
                        continue
                else:
                    # Avoid matching front-table / body cross references
                    # before the real section start.
                    if exclusion_pattern and flag and (start_page is None) and regex.search(exclusion_pattern, cleaned_text):
                        flag = False
                        continue
                if start_page is None and regex.search(begin_pattern, cleaned_text):
                    # 'notice' may start at begin_page itself; others must
                    # start strictly after it.
                    if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page):
                        start_page = i
                        continue
                if start_page is not None:
                    if output_suffix in ["tobidders_notice", "notice", "tobidders_notice2"]:
                        if regex.search(end_pattern, cleaned_text) and i > start_page:
                            end_page = i
                            break
                    else:
                        # The end page must not itself look like a new start.
                        if regex.search(end_pattern, cleaned_text) and not regex.search(begin_pattern, cleaned_text):
                            end_page = i
                            break
        return start_page, end_page

    # First attempt: PyPDF2 (file handle managed by the with-block).
    try:
        with open(pdf_path, "rb") as f:
            pdf_document = PdfReader(f)
            total_pages = len(pdf_document.pages)
            get_text = create_get_text_function('pypdf2', pdf_document)
            start_page, end_page = process_pages(get_text, total_pages)
            return start_page, end_page
    except Exception as e_pypdf2:
        print(f"extract_pages_generic: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
        # Fallback: PyMuPDF.
        try:
            with fitz.open(pdf_path) as pdf_document:
                total_pages = pdf_document.page_count
                get_text = create_get_text_function('fitz', pdf_document)
                start_page, end_page = process_pages(get_text, total_pages)
                return start_page, end_page
        except Exception as e_fitz:
            print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
            return None, None
2025-01-09 16:45:55 +08:00
2025-01-13 11:27:52 +08:00
def generate_mid_pattern(chapter_type=None):
    """
    Build the regex used to detect the "middle" page of the bidder-notice
    section (说明 / 总则 / 名词解释 / 定义 / 须知正文 headings).

    Args:
        chapter_type (str, optional): '章' or '部分' — the heading style that
            matched the section start; the opposite style's chapter heading is
            then also accepted as a mid-section marker.

    Returns:
        regex.Pattern: compiled MULTILINE pattern.
    """
    base_mid_pattern = (
        r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|'
        r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释|定\s*义)|'
        r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$'
    )
    additional_mid_pattern = ''
    if chapter_type:
        # Restored '章' literal (lost in a corrupted copy — the comparison was
        # against '' which made this branch unreachable; the elif already
        # handles '部分', so this must be '章').
        if chapter_type == '章':
            additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:部分)'
        elif chapter_type == '部分':
            additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:章)'
    if additional_mid_pattern:
        combined_pattern = rf'(?:{base_mid_pattern})|(?:{additional_mid_pattern})'
    else:
        combined_pattern = base_mid_pattern
    return regex.compile(combined_pattern, regex.MULTILINE)
2025-01-13 11:27:52 +08:00
def generate_end_pattern(extraction_stage, chapter_type=None):
    """
    Build the end-of-section regex for the bidder-notice extraction.

    Args:
        extraction_stage (str): 'first' or 'third' pass — the third pass uses
            a stricter, anchored template with cross-reference look-behinds.
        chapter_type (str, optional): '章' or '部分'; when omitted, both
            heading styles are accepted.

    Returns:
        regex.Pattern: compiled MULTILINE pattern.
    """
    # Appendix headings terminate the section in every stage.
    appendix_alternatives = (
        r'^附录(?:一)?[:]|'
        r'^附件(?:一)?[:]|'
        r'^附表(?:一)?[:]'
    )
    templates = {
        'first': (
            r'^第[一二三四五六七八九十百千]+(?:{chapter_type})\s*[\u4e00-\u9fff]+'
            r'|' + appendix_alternatives
        ),
        'third': (
            r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:{chapter_type})\s*[\u4e00-\u9fff、()/]+\s*$'
            r'|' + appendix_alternatives
        ),
    }
    chosen = templates['third'] if extraction_stage == 'third' else templates['first']
    filled = chosen.format(chapter_type=chapter_type if chapter_type else '章|部分')
    return regex.compile(filled, regex.MULTILINE)
def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_header, invalid_endpage=-1):
    """
    Extract the "instructions to bidders" section: locate its start, middle
    and end pages and save it as two PDFs (front table + main body).

    Three successively looser strategies are tried; returns (path1, path2),
    or ("", "") when all of them fail.
    """
    exclusion_pattern = regex.compile(
        r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)

    def run_extraction(begin_pattern, get_text, total_pages, extraction_stage='first'):
        """
        Scan pages with begin_pattern; end/mid patterns are generated
        dynamically once the start page's heading style ('章'/'部分') is known.

        Args:
            begin_pattern (regex.Pattern): start-of-section pattern.
            get_text: page-index -> raw text callable.
            total_pages (int): number of pages in the document.
            extraction_stage (str): 'first' or 'third' — selects the end
                pattern template (see generate_end_pattern).

        Returns:
            tuple: (start_page, mid_page, end_page), each possibly None.
        """
        start_page = None
        mid_page = None
        end_page = None
        combined_mid_pattern = None  # built after the start page is found
        catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
        end_limit = total_pages if invalid_endpage == -1 else min(invalid_endpage, total_pages)

        for i in range(end_limit):
            try:
                text = get_text(i)
                cleaned_text = clean_page_content(text, common_header)
            except Exception as e:
                print(f"提取第 {i} 页文本时出错: {e}")
                continue
            # After the middle page is found, skip pages matching the
            # exclusion pattern (cross references to document composition).
            if exclusion_pattern and mid_page is not None and regex.search(exclusion_pattern, cleaned_text):
                continue
            # Skip table-of-contents pages when scanning from page 0.
            if begin_page == 0 and catalog_pattern.search(cleaned_text):
                continue
            # ---- start page ----
            if start_page is None:
                match = regex.search(begin_pattern, cleaned_text)
                if match and i > begin_page:
                    start_page = i
                    matched_text = match.group(0)
                    # Restored '章' literals (lost in a corrupted copy; the
                    # elif already handles '部分').
                    if '章' in matched_text:
                        chapter_type = '章'
                    elif '部分' in matched_text:
                        chapter_type = '部分'
                    else:
                        chapter_type = None
                    # Derive the stage/heading-specific end and mid patterns.
                    end_pattern_dynamic = generate_end_pattern(extraction_stage, chapter_type)
                    combined_mid_pattern = generate_mid_pattern(chapter_type)
                continue
            # ---- middle page ----
            if start_page is not None and mid_page is None and combined_mid_pattern:
                if regex.search(combined_mid_pattern, cleaned_text):
                    mid_page = i
            # ---- end page ----
            if start_page is not None and mid_page is not None:
                if regex.search(end_pattern_dynamic, cleaned_text):
                    if i > mid_page:
                        end_page = i
                        break
        return start_page, mid_page, end_page

    def perform_extraction_and_save(start_page, mid_page, end_page):
        # Part 1: start..mid; part 2: mid..end (same file when mid == end).
        path1 = save_extracted_pages(pdf_path, output_folder, start_page, mid_page, "tobidders_notice_part1",
                                     common_header)
        if mid_page != end_page:
            path2 = save_extracted_pages(pdf_path, output_folder, mid_page, end_page, "tobidders_notice_part2",
                                         common_header)
        else:
            path2 = path1
        return path1, path2

    def process_extraction(get_text, total_pages):
        # Pass 1: strict begin pattern.
        begin_pattern = regex.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+|'
            r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表\s*$|'
            r'(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+\s*$',
            regex.MULTILINE
        )
        start_page, mid_page, end_page = run_extraction(begin_pattern, get_text, total_pages, extraction_stage='first')
        if start_page is not None and mid_page is not None and end_page is not None:
            return perform_extraction_and_save(start_page, mid_page, end_page)
        print(f"第一次提取tobidders_notice失败尝试第二次提取: {pdf_path}")

        # Pass 2: two-range fallback extraction.
        start_page1, end_page1, start_page2, end_page2 = extract_pages_twice_tobidders_notice(
            pdf_path, common_header, begin_page
        )
        if (start_page1 is not None and end_page1 is not None and
                start_page2 is not None and end_page2 is not None):
            if end_page1 == start_page2:
                mid_page = end_page1
                return perform_extraction_and_save(start_page1, mid_page, end_page2)
            else:
                path1 = save_extracted_pages(
                    pdf_path, output_folder, start_page1, end_page1,
                    "tobidders_notice_part1", common_header
                )
                path2 = save_extracted_pages(
                    pdf_path, output_folder, start_page2, end_page2,
                    "tobidders_notice_part2", common_header
                )
                return path1, path2
        print("第二次提取tobidders_notice失败尝试第三次提取!")

        # Pass 3: loosest begin pattern.
        new_begin_pattern = regex.compile(
            r'.*(?:投标人|磋商|谈判|供应商|应答人|比选申请人).*须知\s*$|'
            r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表\s*$',
            regex.MULTILINE
        )
        start_page, mid_page, end_page = run_extraction(new_begin_pattern, get_text, total_pages, extraction_stage='third')
        if start_page is not None and mid_page is not None and end_page is not None:
            return perform_extraction_and_save(start_page, mid_page, end_page)
        else:
            print(f"三次提取tobidders_notice均失败!! {pdf_path}")
            return "", ""

    try:
        try:
            # PyPDF2 first.
            with open(pdf_path, "rb") as f:
                pdf_document = PdfReader(f)
                get_text = create_get_text_function('pypdf2', pdf_document)
                total_pages = len(pdf_document.pages)
                return process_extraction(get_text, total_pages)
        except Exception as e_pypdf2:
            print(f"extract_pages_tobidders_notice: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
            # PyMuPDF fallback.
            with fitz.open(pdf_path) as pdf_document:
                get_text = create_get_text_function('fitz', pdf_document)
                total_pages = pdf_document.page_count
                return process_extraction(get_text, total_pages)
    except Exception as e:
        print(f"处理文件 {pdf_path} 时发生错误: {e}")
        return "", ""
2025-02-14 15:46:07 +08:00
2025-01-09 16:45:55 +08:00
def extract_pages_twice_tobidders_notice(pdf_path, common_header, begin_page):
    """
    Fallback extraction that splits the bidder-notice section into two page
    ranges (front table, then main body).

    Returns:
        (start1, end1, start2, end2) page indices; (None, None, None, None)
        when the first range cannot be located or the PDF cannot be read.
        When the second range is missing, it collapses onto end1.
    """
    try:
        # Keep the file handle scoped so it is closed after reading.
        with open(pdf_path, "rb") as f:
            reader = PdfReader(f)

            # Part-1 start heading; a generic chapter heading doubles as the
            # part-1 end and the part-2 start.
            begin_pattern = regex.compile(
                r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知)+',
                regex.MULTILINE
            )
            second_begin_pattern = regex.compile(
                r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)',
                regex.MULTILINE
            )
            end_pattern = second_begin_pattern
            exclusion_pattern = regex.compile(
                r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$',
                regex.MULTILINE
            )
            exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"]
            output_suffix = "tobidders_notice"

            # Range 1.
            start_page1, end_page1 = extract_pages_generic(
                pdf_path, begin_pattern, end_pattern, begin_page, common_header,
                exclusion_pattern, output_suffix
            )
            if start_page1 is None or end_page1 is None:
                return None, None, None, None

            # Inspect the page where range 1 ended: if its chapter title is a
            # contract/evaluation/etc. chapter, range 2 must restart from the
            # same strict begin pattern instead.
            boundary_text = get_text_pypdf2(reader, end_page1)
            cleaned_boundary = clean_page_content(boundary_text, common_header)
            heading_match = end_pattern.search(cleaned_boundary)
            if heading_match:
                chapter_title = heading_match.group(1) if heading_match.groups() else ""
                if any(word in chapter_title for word in exclusion_words):
                    second_begin_pattern = begin_pattern
                    output_suffix = "tobidders_notice2"

            # Range 2: begin one page earlier so start_page > begin_page holds.
            begin_page2 = end_page1 - 1
            start_page2, end_page2 = extract_pages_generic(
                pdf_path, second_begin_pattern, end_pattern, begin_page2, common_header,
                exclusion_pattern, output_suffix
            )
            if start_page2 is None or end_page2 is None:
                return start_page1, end_page1, end_page1, end_page1
            return start_page1, end_page1, start_page2, end_page2
    except Exception as e:
        print(f"读取 PDF 文件失败: {e}")
        return None, None, None, None
def get_notice(pdf_path, output_folder, begin_page, common_header, invalid_endpage):
    """
    Locate and save the announcement ("notice") section of a tender PDF.

    Returns:
        the saved file path, or "" when the page range cannot be determined
        (save_extracted_pages handles the None range and returns "").
    """
    begin_pattern = regex.compile(
        r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)]?\s*$',
        regex.MULTILINE
    )
    end_pattern = regex.compile(
        r'^第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|'  # chapter heading
        r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'  # reject cross references
        r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$|'  # anchored chapter heading
        r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
        r'(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+\s*$',  # bidder-notice heading
        regex.MULTILINE
    )
    exclusion_pattern = regex.compile(
        r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)

    suffix = 'notice'
    start_page, end_page = extract_pages_generic(
        pdf_path, begin_pattern, end_pattern, begin_page, common_header,
        exclusion_pattern, suffix, invalid_endpage
    )
    if start_page is None or end_page is None:
        print(f"{suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
    return save_extracted_pages(pdf_path, output_folder, start_page, end_page, suffix, common_header)
2024-12-18 16:01:32 +08:00
if __name__ == "__main__":
    # Ad-hoc manual test entry: hard-coded Windows paths; the sample call is
    # kept commented out for reference.
    file_path = r'D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\ztbfile.pdf'
    output_folder = r'D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\tmp'
    # res=get_invalid_file(file_path,output_folder,"",0)