# zbparse/flask_app/工程标/截取pdf工程标版.py

# flask_app/工程标/截取pdf工程标版.py
import regex
import os
import time
from PyPDF2 import PdfReader, PdfWriter
from flask_app.general.clean_pdf import clean_page_content

from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, get_invalid_file, \
    extract_pages_tobidders_notice
from flask_app.general.通用功能函数 import get_global_logger

def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix, common_header, invalid_endpage=-1):
    """
    Locate a section by a begin/end pattern pair and save it as its own PDF.

    Pages are scanned from index 0 up to ``invalid_endpage`` (capped at the
    page count), or through the whole document when ``invalid_endpage`` is -1.
    Each page's text is passed through ``clean_page_content`` before matching.

    Args:
        pdf_path: Path of the PDF to scan.
        output_folder: Folder forwarded to ``save_extracted_pages``.
        begin_pattern: Pattern marking the first page of the section.
        begin_page: Page-index threshold for the start page. For the
            "notice" suffix the start index may equal it; for any other
            suffix the start index must be strictly greater.
        end_pattern: Pattern marking the page that ends the section.
        output_suffix: Section label, forwarded to ``save_extracted_pages``.
        common_header: Header text forwarded to ``clean_page_content``.
        invalid_endpage: Exclusive upper bound for the scan; -1 disables it.

    Returns:
        The value returned by ``save_extracted_pages`` when both a start and
        an end page are found, otherwise an empty string.
    """
    reader = PdfReader(pdf_path)
    exclusion_pattern = regex.compile(
        r'文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制')

    page_count = len(reader.pages)
    if invalid_endpage == -1:
        scan_limit = page_count
    else:
        scan_limit = min(invalid_endpage, page_count)

    is_notice = output_suffix == "notice"
    skip_pending = True  # only the first page hitting the exclusion pattern is skipped
    first_idx = None
    last_idx = None

    for idx in range(scan_limit):
        content = clean_page_content(reader.pages[idx].extract_text(), common_header)

        if skip_pending and regex.search(exclusion_pattern, content):
            skip_pending = False
            continue

        if first_idx is None:
            # Still looking for the start page; the end pattern is not consulted yet.
            if regex.search(begin_pattern, content):
                past_threshold = idx >= begin_page if is_notice else idx > begin_page
                if past_threshold:
                    first_idx = idx
            continue

        # A page that also matches the begin pattern cannot close the section.
        if regex.search(end_pattern, content) and idx > first_idx and not regex.search(begin_pattern, content):
            last_idx = idx
            break

    if first_idx is not None and last_idx is not None:
        return save_extracted_pages(pdf_path, output_folder, first_idx, last_idx, output_suffix, common_header)

    print(f"{output_suffix} first: 未找到起始或结束页在文件 {pdf_path} 中！")
    return ""


def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, last_begin_index):
    """
    Fallback extraction, called after every begin/end pattern pair has failed.

    Only the "qualification" suffix has a fallback strategy: starting at
    ``last_begin_index``, find the first page that both mentions one of the
    qualification keywords and has a line starting with an appendix /
    attachment / schedule heading (unnumbered or numbered one), then stop at
    the next page that starts another such heading without any keyword, or
    that carries a chapter/part title.

    Args:
        pdf_path: Path of the PDF to scan.
        output_folder: Folder forwarded to ``save_extracted_pages``.
        output_suffix: Section label; anything other than "qualification"
            returns an empty string immediately.
        common_header: Header text forwarded to ``clean_page_content``.
        last_begin_index: Page index at which the scan starts.

    Returns:
        The value returned by ``save_extracted_pages`` on success, otherwise
        an empty string. Exceptions are printed and also yield an empty string.
    """
    if output_suffix != "qualification":
        # No fallback exists for other sections, so do not open the PDF at all.
        return ""
    try:
        pdf_document = PdfReader(pdf_path)
        print("最后一次尝试获取qualification!")
        # Keywords a page must contain to count as part of the qualification section.
        include_keys = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查", "资格检查", "能力","信誉"]
        # Start heading: appendix / attachment / schedule, unnumbered or numbered one.
        # Compiled once instead of handing the raw string to regex.search per page.
        begin_pattern = regex.compile(
            r'^(?:附录(?:[一1])?[：:]|附件(?:[一1])?[：:]|附表(?:[一1])?[：:])',
            regex.MULTILINE
        )
        # End heading, variant 1: any appendix / attachment / schedule heading.
        end_pattern_attachment = regex.compile(
            r'^(?:附录[一二三四五六七八九1-9]*[：:]|附件[一二三四五六七八九1-9]*[：:]|附表[一二三四五六七八九1-9]*[：:]).*$',
            regex.MULTILINE
        )
        # End heading, variant 2: a chapter / part title.
        end_pattern_chapter = regex.compile(
            r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$',
            regex.MULTILINE
        )
        start_page = None
        end_page = None
        for i, page in enumerate(pdf_document.pages[last_begin_index:], start=last_begin_index):
            text = page.extract_text()
            if not text:
                continue
            cleaned_text = clean_page_content(text, common_header)
            has_keys = any(key in cleaned_text for key in include_keys)

            if start_page is None:
                # Until a start page exists, no end pattern is considered.
                if has_keys and begin_pattern.search(cleaned_text):
                    start_page = i
                continue

            if end_pattern_attachment.search(cleaned_text):
                if has_keys:
                    # Another attachment that still belongs to the qualification section.
                    print(f"当前页面匹配附件结束模式但包含 include_keys，继续查找。页面: {i}")
                elif i > start_page:
                    end_page = i
                    break
            elif end_pattern_chapter.search(cleaned_text) and i > start_page:
                end_page = i
                break

        if start_page is None or end_page is None:
            print(f"{output_suffix} twice: 未找到起始或结束页在文件 {pdf_path} 中！")
            return ""
        return save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)
    except Exception as e:
        print(f"Error in extract_pages_twice: {e}")
        return ""


def _engineering_pattern_pairs(selection):
    """Return ``(pattern_pairs, suffix)`` for selections 1-3, or ``None`` for any other value."""
    if selection == 1:
        # Selection 1: tender notice / invitation to bid.
        pattern_pairs = [
            (
                regex.compile(
                    r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'
                ),
                regex.compile(
                    r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE
                )
            ),
            (
                regex.compile(
                    r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)）]?\s*$',
                    regex.MULTILINE
                ),
                regex.compile(
                    r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商|应答人)须知(?:前附表)?\s*$',
                    regex.MULTILINE
                )
            )
        ]
        return pattern_pairs, "notice"

    if selection == 2:
        # Selection 2: evaluation method.
        pattern_pairs = [
            (
                regex.compile(
                    r'^第[一二三四五六七八九十]+(?:章|部分)\s*'
                    r'(?=.*(?:磋商(?=.*(?:办法|方法|内容))|'
                    r'谈判(?=.*(?:办法|方法|内容))|评标|评定|评审))',
                    regex.MULTILINE
                ),
                regex.compile(
                    r'^第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+|[：:]清标报告\s*$|'
                    r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$|'
                    r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:评标|定标)详细程序\s*$',
                    regex.MULTILINE
                )
            ),
            (
                # The quote lookbehinds here used to be the straight quote repeated
                # three times; they now reject the curly quotes too, matching every
                # other pattern in this table.
                regex.compile(
                    r'(?:(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*'  # first form: chapter title
                    r'(?:[\u4e00-\u9fff、()（）]*?)'
                    r'(?=.*(?:(?:磋商|谈判)(?=.*(?:办法|方法|内容))|(?:评标|评定|评审)))'
                    r'[\u4e00-\u9fff、()（）]*\s*$|'
                    r'^\s*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$)',  # second form: front table heading
                    regex.MULTILINE
                ),
                regex.compile(
                    r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$',
                    regex.MULTILINE
                )
            )
        ]
        return pattern_pairs, "evaluation_method"

    if selection == 3:
        # Selection 3: qualification review conditions.
        pattern_pairs = [
            (
                regex.compile(
                    r'^(?:附录(?:[一1])?[：:]|附件(?:[一1])?[：:]|附表(?:[一1])?[：:]).*(?:资质|资格).*$|'
                    r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*'
                    r'[\u4e00-\u9fff、()（）]*资格[\u4e00-\u9fff、()（）]*\s*$',
                    regex.MULTILINE
                ),
                regex.compile(
                    r'^(?:附录[一二三四五六七八九1-9]*[：:]|附件[一二三四五六七八九1-9]*[：:]|附表[一二三四五六七八九1-9]*[：:])(?!.*(?:资质|资格)).*|'
                    r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+',
                    regex.MULTILINE
                )
            )
        ]
        return pattern_pairs, "qualification"

    return None


def truncate_pdf_main_engineering(input_path, output_folder, selection, logger, output_suffix="default",invalid_endpage=-1):
    """
    Cut one section out of an engineering tender PDF, or out of each PDF in a folder.

    Args:
        input_path: Path of a single file ending in ``.pdf``, or of a folder
            whose direct entries ending in ``.pdf`` are processed one by one.
        output_folder: Folder forwarded to the extraction helpers.
        selection: Which section to cut: 1 tender notice, 2 evaluation method,
            3 qualification review conditions, 4 instructions to bidders
            (front table plus body), 5 invalid-bid file.
        logger: Object whose ``error`` method receives per-file failures.
        output_suffix: Accepted for interface compatibility only; the suffix
            actually used is derived from ``selection``.
        invalid_endpage: Page bound forwarded to the helpers; -1 means none.

    Returns:
        For a single file, a list: ``[path]`` for selections 1-3,
        ``[path1, path2]`` for selection 4, ``[invalid_path, end_page]`` for
        selection 5, and ``[""]`` for an unknown selection. For a folder, a
        list holding one such per-file list per PDF. On failure, ``["", ""]``
        for selection 4 and ``[""]`` otherwise.
    """
    def empty_result():
        # Build a fresh list each time so results are never aliased.
        # NOTE(review): selection 5 normally yields two elements but gets a
        # one-element list here — confirm what callers expect on failure.
        return [""] * (2 if selection == 4 else 1)

    try:
        def process_single_file(pdf_path, output_folder, selection):
            """Run the extraction chosen by ``selection`` on one PDF."""
            try:
                common_header, last_begin_index = get_start_and_common_header(pdf_path, 20)

                if selection == 4:
                    path1, path2 = extract_pages_tobidders_notice(pdf_path, output_folder, last_begin_index, common_header, invalid_endpage)
                    return [path1 or "", path2 or ""]
                if selection == 5:
                    invalid_path, end_page = get_invalid_file(pdf_path, output_folder, common_header, last_begin_index)
                    return [invalid_path or "", end_page]

                spec = _engineering_pattern_pairs(selection)
                if spec is None:
                    print("无效的选择:请选择1-5")
                    return [""]
                pattern_pairs, suffix = spec

                # Same for every pattern pair, so compute it once. When no start
                # index was detected (0), fall back to a per-section default.
                if last_begin_index != 0:
                    begin_page = last_begin_index
                else:
                    begin_page = {
                        1: 0,  # notice
                        2: 5,  # evaluation method
                        3: 5,  # qualification
                    }.get(selection, 0)

                for begin_pattern, end_pattern in pattern_pairs:
                    result = extract_pages(
                        pdf_path,
                        output_folder,
                        begin_pattern,
                        begin_page=begin_page,
                        invalid_endpage=invalid_endpage,
                        end_pattern=end_pattern,
                        output_suffix=suffix,
                        common_header=common_header
                    )
                    if result:
                        return [result]

                # Every pattern pair failed: try the fallback extractor.
                print(f"Selection {selection}: 所有模式对都未匹配成功。尝试执行 extract_pages_twice 或返回空。")
                fallback_result = extract_pages_twice(
                    pdf_path,
                    output_folder,
                    suffix,
                    common_header,
                    last_begin_index
                )
                return [fallback_result or ""]

            except Exception as e:
                logger.error(f"Error in processing file '{pdf_path}': {e}")
                return empty_result()

        if os.path.isdir(input_path):
            generated_files = []
            for file_name in os.listdir(input_path):
                if file_name.endswith('.pdf'):
                    pdf_path = os.path.join(input_path, file_name)
                    files = process_single_file(pdf_path, output_folder, selection)
                    # NOTE(review): process_single_file always returns a list, so
                    # the tuple branch never runs and each per-file list is
                    # appended as-is (nested result). Kept to preserve the
                    # return shape — confirm whether flattening was intended.
                    if isinstance(files, tuple):
                        generated_files.extend([f if f else "" for f in files])
                    else:
                        generated_files.append(files)
            return generated_files
        elif os.path.isfile(input_path) and input_path.endswith('.pdf'):
            return process_single_file(input_path, output_folder, selection)
        else:
            print("提供的路径既不是文件夹也不是PDF文件。")
            return empty_result()
    except Exception as e:
        # The message used to name a different function (truncate_pdf_main_goods).
        print(f"Error in truncate_pdf_main_engineering: {e}")
        return empty_result()


if __name__ == "__main__":
    # Manual run against a local sample file; adjust the paths before use.
    logger = get_global_logger("123")
    start_time = time.time()

    # Alternative sample inputs:
    # input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
    # pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
    # input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
    pdf_path = r"D:\flask_project\flask_app\static\output\output1\86317976-040a-4c91-87e2-7718da869fd0\tmp\ztbfile.pdf"
    output_folder = r"D:\flask_project\flask_app\static\output\output1\86317976-040a-4c91-87e2-7718da869fd0\tmp"

    # 1 tender notice, 2 evaluation method, 3 qualification review conditions,
    # 4 instructions to bidders (front table + body), 5 invalid-bid file
    selection = 4

    generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection, logger)
    print(generated_files)

    elapsed = time.time() - start_time
    print(f"耗时：{elapsed}")