# flask_app/general/截取pdf通用函数.py
import concurrent.futures
import os

import regex
from PyPDF2 import PdfReader, PdfWriter

from flask_app.general.clean_pdf import extract_common_header, clean_page_content
from flask_app.general.format_change import docx2pdf
def get_start_and_common_header(input_path, end_page):
    """Locate the page on which the tender announcement section starts.

    Scans the first *end_page* pages of the PDF for an announcement-style
    heading (招标公告/邀请书/…), skipping table-of-contents pages, and stops
    at the first match.

    Args:
        input_path: path of the PDF to scan.
        end_page: last page index (0-based, inclusive) of the search window.

    Returns:
        tuple: (common_header, page_index) where page_index is the 0-based
        index of the first matching page, or 0 when nothing matched inside
        the window.
    """
    common_header = extract_common_header(input_path)
    # Announcement heading at end of line, not preceded by reference words
    # (见/与) or quotes, optionally followed by a closing bracket.
    announcement_heading = regex.compile(
        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\))]?\s*$',
        regex.MULTILINE,
    )
    # TOC pages may quote the heading verbatim; they must be ignored.
    toc_heading = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)

    for page_index, page in enumerate(PdfReader(input_path).pages):
        if page_index > end_page:
            # Search window exhausted without a hit.
            return common_header, 0
        raw_text = page.extract_text()
        if not raw_text:
            continue
        page_text = clean_page_content(raw_text, common_header)
        if toc_heading.search(page_text):
            continue
        if announcement_heading.search(page_text):
            return common_header, page_index
    return common_header, 0
def check_pdf_pages(pdf_path, mode, logger):
    """Decide whether *pdf_path* is short enough to skip the split pipeline.

    Args:
        pdf_path: path of the PDF to inspect.
        mode: pipeline mode; 'goods' result lists carry one extra slot.
        logger: logger used for progress / error reporting.

    Returns:
        tuple: (True, placeholder_list) when the PDF has 30 pages or fewer,
        or when it cannot be read at all — the placeholder list carries
        pdf_path in the slot downstream code expects; (False, []) when the
        PDF is long enough to continue with the split logic.
    """
    def skip_result():
        # 'goods' pipelines expect 8 fields, the others 7; pdf_path sits in
        # the second-to-last position in both layouts.
        leading_blanks = 6 if mode == 'goods' else 5
        return True, [''] * leading_blanks + [pdf_path, '']

    try:
        num_pages = len(PdfReader(pdf_path).pages)
        logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}")
        if num_pages <= 30:
            logger.info("PDF(invalid_path)页数小于或等于30页,跳过切分逻辑。")
            return skip_result()
        # More than 30 pages: signal the caller to run the split logic.
        return False, []
    except Exception as e:
        logger.error(f"无法读取 PDF 页数: {e}")
        # Unreadable PDF: fall back to the same placeholder result so the
        # pipeline can still proceed with the original file.
        return skip_result()
def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header):
    """Save pages [start_page, end_page] of *pdf_path* as a new PDF.

    For the 'notice' suffix, additionally writes a "<name>_before.pdf" with
    the pages preceding the notice section (up to and including the TOC page
    when one is found before start_page).

    Args:
        pdf_path: source PDF path.
        output_folder: destination folder for the generated file(s).
        start_page: first page index (0-based, inclusive).
        end_page: last page index (0-based, inclusive).
        output_suffix: suffix appended to the output file name; 'notice'
            triggers the extra "before" extraction.
        common_header: repeated page header stripped before TOC detection.

    Returns:
        str: path of the saved PDF, or "" on any invalid input or error.
    """
    try:
        if start_page is None or end_page is None:
            print("Error: start_page 或 end_page 为 None")
            return ""

        reader = PdfReader(pdf_path)
        page_count = len(reader.pages)
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        output_pdf_path = os.path.join(output_folder, f"{stem}_{output_suffix}.pdf")

        if start_page < 0 or end_page >= page_count or start_page > end_page:
            print(f"无效的页面范围: {start_page} 到 {end_page}")
            return ""

        if output_suffix == 'notice' and start_page > 0:
            # Extract everything before the notice section into a side file.
            before_pdf_path = os.path.join(output_folder, f"{stem}_before.pdf")
            before_doc = PdfWriter()

            # Look for a TOC page ahead of start_page; if found, the "before"
            # part ends at the TOC page instead of at start_page.
            toc_page = -1
            for idx in range(min(start_page, page_count)):
                cleaned = clean_page_content(reader.pages[idx].extract_text(), common_header)
                if regex.search(r'目\s*录', cleaned, regex.MULTILINE):
                    toc_page = idx
                    break

            pages_to_extract = toc_page + 1 if toc_page != -1 else start_page
            for idx in range(pages_to_extract):
                before_doc.add_page(reader.pages[idx])
            print(before_pdf_path)
            with open(before_pdf_path, 'wb') as f_before:
                before_doc.write(f_before)

        writer = PdfWriter()
        for idx in range(start_page, end_page + 1):
            writer.add_page(reader.pages[idx])
        with open(output_pdf_path, 'wb') as f_out:
            writer.write(f_out)
        print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
        return output_pdf_path
    except Exception as e:
        print(f"Error in save_extracted_pages: {e}")
        return ""  # empty string signals failure to the caller
def is_pdf_or_doc(filename):
    """Return True when *filename* looks like a PDF or Word document.

    The check is case-insensitive and based purely on the file extension.
    """
    supported_extensions = ('.pdf', '.doc', '.docx')
    return filename.lower().endswith(supported_extensions)
def convert_to_pdf(file_path):
    """Return a PDF path for *file_path*, converting Word documents.

    .doc/.docx files are converted via docx2pdf; any other path is returned
    unchanged (the extension check is case-insensitive).
    """
    if not file_path.lower().endswith(('.doc', '.docx')):
        # Already a PDF (or an unsupported type) — nothing to convert.
        return file_path
    return docx2pdf(file_path)
def get_invalid_file(file_path, output_folder, common_header, begin_page):
    """
    Extract the non-attachment ("invalid") portion of a tender PDF and save
    it to *output_folder*.

    Search strategy:
      - begin_patterns are matched front-to-back, end_patterns back-to-front;
      - with fewer than 100 pages the begin scan is skipped (start_page = 0);
      - with more than 200 pages the end scan runs front-to-back instead.

    Args:
        file_path (str): input PDF path.
        output_folder (str): folder for the extracted file.
        common_header (str): repeated page header stripped from each page.
        begin_page (int): page index at which the begin scan starts.

    Returns:
        str: the original file_path when the PDF has 50 pages or fewer,
        otherwise the saved file path, or "" on failure.
    """
    # Start-of-section patterns (announcement / invitation headings).
    begin_patterns = [
        regex.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请',
            regex.MULTILINE
        ),
        regex.compile(
            r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
            regex.MULTILINE
        )
    ]
    # End-of-section patterns (contract chapter / bid-format chapter).
    end_patterns = [
        regex.compile(
            r'第[一二三四五六七八九十]+(?:章|部分).*?合同|[::]清标报告',
            regex.MULTILINE
        ),
        regex.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?格式.*',
            regex.MULTILINE
        ),
        regex.compile(
            r"\s*(投标文件|响应文件|响应性文件|应答文件)(?:的)?格式\s*",
            regex.MULTILINE
        )
    ]
    # Pages matching this are cross-references (文件的构成/编制) and must not
    # count as a start-page hit.
    exclusion_pattern = regex.compile(
        r'文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制',
        regex.MULTILINE
    )
    try:
        # Open the PDF.
        pdf_document = PdfReader(file_path)
        total_pages = len(pdf_document.pages)
        # Short documents (<= 50 pages) are returned whole, untouched.
        if total_pages <= 50:
            print(f"PDF页数({total_pages}) <= 50,直接返回文件路径:{file_path}")
            return file_path
        # Pre-extract and clean every page's text once, so both scans below
        # (possibly run concurrently) work on the same cached list.
        page_texts = []
        for i in range(total_pages):
            page = pdf_document.pages[i]
            text = page.extract_text()
            cleaned_text = clean_page_content(text, common_header) if text else ""
            page_texts.append(cleaned_text)

        # Find the start page: scan at most 30 pages from begin_page forward,
        # skipping cross-reference pages; default to page 0 when no hit.
        def find_start_page(begin_page):
            for pattern in begin_patterns:
                for i in range(begin_page, min(begin_page + 30, total_pages)):
                    text = page_texts[i]
                    if regex.search(exclusion_pattern, text):
                        continue
                    if regex.search(pattern, text):
                        return i
            return 0  # default: start at the first page

        # Find the end page. Direction depends on document size (see docstring).
        def find_end_page():
            if total_pages > 200:
                # Very large document: scan forward, skipping the first 30 pages.
                for pattern in end_patterns:
                    for i in range(30, total_pages):
                        text = page_texts[i]
                        if regex.search(pattern, text):
                            return i
            else:
                # Scan backward, stopping once only the first 30 pages remain.
                for pattern in end_patterns:
                    for i in range(total_pages - 1, 29, -1):
                        text = page_texts[i]
                        if regex.search(pattern, text):
                            return i
            # No end pattern matched: fall back to a heuristic cut-off.
            if total_pages > 100:
                print("未找到结束模式,总页数大于100,设置结束页为前100页。"+file_path)
                return max(100, int(total_pages * 2 / 3))
            else:
                print("未找到结束模式,设置结束页为最后一页。"+file_path)
                return total_pages - 1

        # Fewer than 100 pages: skip the begin scan entirely.
        if total_pages < 100:
            start_page = 0
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future_end = executor.submit(find_end_page)
                end_page = future_end.result()
        else:
            # Run both scans concurrently.
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future_start = executor.submit(find_start_page, begin_page)
                future_end = executor.submit(find_end_page)
                start_page = future_start.result()
                end_page = future_end.result()

        # Validate the resulting page range.
        if start_page > end_page:
            print(f"无效的页码范围: 起始页 ({start_page + 1}) > 结束页 ({end_page + 1})")
            return ""

        # Delegate the actual page extraction / saving.
        output_path = save_extracted_pages(
            pdf_path=file_path,
            output_folder=output_folder,
            start_page=start_page,
            end_page=end_page,
            output_suffix="invalid",
            common_header=common_header
        )
        return output_path

    except Exception as e:
        print(f"处理文件 {file_path} 时发生错误: {e}")
        return ""
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
                          output_suffix="normal"):
    """Generic section-boundary finder.

    Walks pages from *begin_page* on, returning the first page matching
    *begin_pattern* and the first later page matching *end_pattern*.
    Behavior varies slightly with *output_suffix*:
      - "tobidders_notice": TOC pages are skipped (when begin_page == 0),
        and the exclusion pattern suppresses one page AFTER the start page;
      - "notice": the start may land on begin_page itself (i >= begin_page);
      - otherwise: the exclusion pattern suppresses one page BEFORE the start
        page, and a page matching both begin and end patterns cannot end
        the section.

    Returns:
        tuple: (start_page, end_page) 0-based indices; either may be None
        when no match was found.
    """
    start_page = None
    end_page = None
    # One-shot guard: the exclusion pattern is only allowed to skip a single page.
    flag = True
    for i, page in enumerate(pdf_document.pages[begin_page:], start=begin_page):
        text = page.extract_text() or ""
        cleaned_text = clean_page_content(text, common_header)
        if output_suffix == "tobidders_notice":
            catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
            # Once the section has started, skip (once) a page that only
            # talks about 文件的构成/编制 — it is not the section end.
            if exclusion_pattern and flag and (start_page is not None) and regex.search(exclusion_pattern, cleaned_text):
                flag = False
                continue
            if begin_page == 0 and catalog_pattern.search(cleaned_text):
                continue  # skip TOC pages
        else:
            # 投标文件的编制/组成 normally lives inside the bidder-notice
            # front table / body; skip it (once) before the start page is
            # found so begin_pattern does not latch onto that page.
            if exclusion_pattern and flag and (start_page is None) and regex.search(exclusion_pattern, cleaned_text):
                flag = False
                continue
        if start_page is None and regex.search(begin_pattern, cleaned_text):
            # 'notice' may start at begin_page itself; others must start later.
            if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page):
                start_page = i
                continue
        if start_page is not None:
            if output_suffix == "tobidders_notice":
                if regex.search(end_pattern, cleaned_text) and i > start_page:
                    end_page = i
                    break
            else:
                # A page that re-matches begin_pattern cannot close the section.
                if regex.search(end_pattern, cleaned_text) and (i > start_page) and not regex.search(begin_pattern, cleaned_text):
                    end_page = i
                    break
    return start_page, end_page
def generate_end_pattern(extraction_stage, chapter_type=None):
    """Build the compiled end-of-section pattern for a given extraction pass.

    Args:
        extraction_stage (str): 'third' selects the looser third-pass
            template; any other value selects the first-pass template.
        chapter_type (str, optional): '章' or '部分' to pin the heading word;
            None allows either.

    Returns:
        regex.Pattern: compiled MULTILINE pattern.
    """
    # First-pass template: a numbered chapter heading not preceded by
    # reference words (见/与) or quotes, or a standalone
    # 评标(方法|办法)前附表 / 正文 heading.
    first_template = (
        r'^(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:{chapter_type})\s*[\u4e00-\u9fff]+'
        r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$'
        r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$'
    )
    # Third-pass template: any numbered chapter heading ending the line.
    third_template = (
        r'第[一二三四五六七八九十百千]+(?:{chapter_type})\s*[\u4e00-\u9fff、()()]+\s*$'
    )

    chosen = third_template if extraction_stage == 'third' else first_template
    # Substitute the chapter keyword; fall back to matching both forms.
    return regex.compile(
        chosen.format(chapter_type=chapter_type or '章|部分'),
        regex.MULTILINE,
    )
def generate_mid_pattern(chapter_type=None):
    """Build the compiled pattern marking the boundary between the front
    table (前附表) and the main body (正文) of the bidder-notice section.

    Args:
        chapter_type (str, optional): '章' or '部分'; when given, a heading
            using the *other* keyword also counts as the boundary.

    Returns:
        regex.Pattern: compiled MULTILINE pattern.
    """
    # Base markers: a numbered 说明/总则/名词解释 item, or a line ending in
    # 须知正文 that is not a cross-reference (not preceded by 见/与/quotes).
    core = (
        r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|'
        r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)'
        r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$'
    )

    # If the section opened with 章, a 部分 heading also ends the front
    # table — and vice versa. Any other chapter_type adds nothing.
    opposite = {
        '章': r'^第[一二三四五六七八九十百千]+?(?:部分)',
        '部分': r'^第[一二三四五六七八九十百千]+?(?:章)',
    }.get(chapter_type, '')

    combined = rf'(?:{core})|(?:{opposite})' if opposite else core
    return regex.compile(combined, regex.MULTILINE)
def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_header):
    """
    Extract the bidder-notice section (start / middle / end pages) from the PDF.

    Makes up to three attempts: the first with a strict begin_pattern, the
    second via extract_pages_twice_tobidders_notice, the third with a looser
    begin_pattern. The first successful attempt is saved and returned.

    Returns:
        tuple: (part1_path, part2_path) of the saved PDFs, or ("", "") when
        all three attempts fail.
    """
    # Cross-reference pages (文件的构成/编制…) that must not end the section.
    exclusion_pattern = regex.compile(r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制')

    def run_extraction(begin_pattern, extraction_stage='first'):
        """
        Run one extraction pass with the provided begin_pattern.

        extraction_stage ('first' or 'third') selects which dynamically
        generated end_pattern family is used once the start page is found.

        Returns:
            tuple: (start_page, mid_page, end_page); any element may be None.
        """
        # ===== pass-local state =====
        start_page = None
        mid_page = None
        end_page = None
        combined_mid_pattern = None  # mid-page pattern, built after the start is found

        pdf_document = PdfReader(pdf_path)
        catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)

        # ===== walk every page =====
        for i, page in enumerate(pdf_document.pages):
            text = page.extract_text() or ""
            cleaned_text = clean_page_content(text, common_header)

            # Once the mid page is known, skip pages matching the exclusion
            # pattern (they cannot be the end page).
            if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None:
                continue

            # If the section started on page 0, skip later TOC pages.
            if start_page == 0 and catalog_pattern.search(cleaned_text):
                continue

            # ========== start page ==========
            if start_page is None:
                match = regex.search(begin_pattern, cleaned_text)
                if match and (start_page is None or i > start_page):
                    start_page = i
                    matched_text = match.group(0)  # the full matched heading
                    # Detect whether the heading uses 章 or 部分, which drives
                    # both the mid and end pattern generation below.
                    if '章' in matched_text:
                        chapter_type = '章'
                    elif '部分' in matched_text:
                        chapter_type = '部分'
                    else:
                        chapter_type = None  # neither keyword present

                    # ===== dynamically build the end pattern =====
                    end_pattern_dynamic = generate_end_pattern(extraction_stage, chapter_type)
                    # ===== dynamically build the mid pattern =====
                    combined_mid_pattern = generate_mid_pattern(chapter_type)
                    # Start page found — move on to the next page.
                    continue

            # ========== mid page ==========
            if start_page is not None and mid_page is None and combined_mid_pattern:
                if regex.search(combined_mid_pattern, cleaned_text):
                    mid_page = i

            # ========== end page ==========
            if start_page is not None and mid_page is not None:
                # end_pattern_dynamic was chosen above from extraction_stage
                # and chapter_type.
                if regex.search(end_pattern_dynamic, cleaned_text):
                    if i > mid_page:
                        end_page = i
                        break

        return start_page, mid_page, end_page

    def perform_extraction_and_save(start_page, mid_page, end_page):
        # Save part 1 (front table) and, when distinct, part 2 (main body).
        path1 = save_extracted_pages(pdf_path, output_folder, start_page, mid_page, "tobidders_notice_part1", common_header)
        if mid_page != end_page:
            path2 = save_extracted_pages(pdf_path, output_folder, mid_page, end_page, "tobidders_notice_part2", common_header)
        else:
            path2 = path1  # single-part section: both paths point to part 1
        return path1, path2

    # First attempt: strict begin_pattern (chapter heading or 须知前附表 line).
    begin_pattern = regex.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+|'
        r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!"\s*)(?<!"\s*)(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表\s*$',
        regex.MULTILINE
    )
    start_page, mid_page, end_page = run_extraction(begin_pattern, extraction_stage='first')
    # First attempt succeeded: save and return the paths.
    if start_page is not None and mid_page is not None and end_page is not None:
        return perform_extraction_and_save(start_page, mid_page, end_page)

    print(f"第一次提取tobidders_notice失败,尝试第二次提取: {pdf_path}")
    pdf_document = PdfReader(pdf_path)
    start_page, mid_page, end_page = extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page)

    if start_page is not None and mid_page is not None and end_page is not None:
        return perform_extraction_and_save(start_page, mid_page, end_page)

    print("第二次提取tobidders_notice失败,尝试第三次提取!")
    # Third attempt: looser begin_pattern (no chapter-number requirement).
    new_begin_pattern = regex.compile(
        r'.*(?:投标人|磋商|谈判|供应商|应答人|比选申请人).*须知\s*$|'
        r'(?:一\s*、\s*)?(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表',
        regex.MULTILINE
    )
    start_page, mid_page, end_page = run_extraction(new_begin_pattern, extraction_stage='third')
    if start_page is not None and mid_page is not None and end_page is not None:
        return perform_extraction_and_save(start_page, mid_page, end_page)
    else:
        # All three attempts failed.
        print(f"三次提取tobidders_notice均失败!! {pdf_path}")
        return "", ""
def extract_pages_twice_tobidders_notice(pdf_document, common_header, begin_page):
    """Second-chance extraction of the bidder-notice section.

    Locates part 1 (the 须知 chapter up to the next chapter heading) and,
    unless the following chapter is an unrelated one (contract, evaluation,
    …), extends the search to find where part 2 ends.

    Args:
        pdf_document: an open PdfReader instance.
        common_header (str): repeated page header stripped from each page.
        begin_page (int): page index at which the scan starts.

    Returns:
        tuple: (start_page, mid_page, end_page) 0-based page indices; any
        element that could not be located is None so that callers' `is not
        None` checks work.
    """
    output_suffix = "tobidders_notice"
    begin_pattern = regex.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知)+',
        regex.MULTILINE
    )
    # Any numbered chapter heading; group(1) captures the Chinese title.
    end_pattern = regex.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', regex.MULTILINE
    )
    exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"]  # titles that mean "no part 2"

    exclusion_pattern = regex.compile(r'文件的构成|文件的组成|文件构成|文件组成|文件的编制|文件编制')

    # Part 1: from the 须知 chapter heading to the next chapter heading.
    start_page1, end_page1 = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern, output_suffix)
    if start_page1 is None or end_page1 is None:
        # BUG FIX: previously returned "" placeholders. An empty string
        # passes the callers' `is not None` checks, which both skipped the
        # remaining fallback extraction attempts and fed "" page indices
        # into save_extracted_pages. Return None so failure is detectable.
        return None, None, None

    # Part 2 starts where part 1 ended.
    start_page2 = end_page1

    # Inspect the chapter heading on end_page1.
    text = pdf_document.pages[end_page1].extract_text() or ""
    cleaned_text = clean_page_content(text, common_header)
    match = end_pattern.search(cleaned_text)

    if match:
        # The captured Chinese chapter title.
        chapter_title = match.group(1)
        # An unrelated chapter (contract/evaluation/…) follows part 1:
        # there is no part 2, so mid and end coincide.
        if any(word in chapter_title for word in exclusion_words):
            return start_page1, end_page1, end_page1

    # Otherwise keep scanning for the chapter heading that closes part 2.
    _, end_page2 = extract_pages_generic(pdf_document, end_pattern, end_pattern, start_page2 - 1, common_header,
                                         exclusion_pattern, output_suffix)

    if end_page2 is None:
        # BUG FIX: was "" (see above) — None lets the caller fall through
        # to its third extraction attempt.
        return start_page1, end_page1, None

    return start_page1, end_page1, end_page2
if __name__ == "__main__":
    # Manual smoke test: extract the "invalid" portion of a sample tender PDF.
    # Paths are developer-local; adjust before running.
    file_path = r'D:\flask_project\flask_app\static\output\output1\f91db70d-8d96-44a5-b840-27d2f1ecbe95\ztbfile.pdf'
    output_folder = r'D:\flask_project\flask_app\static\output\output1\f91db70d-8d96-44a5-b840-27d2f1ecbe95\tmp'
    # Empty common_header and begin_page=0 exercise the default scan path.
    res = get_invalid_file(file_path, output_folder, "", 0)