# zbparse/flask_app/general/截取pdf_main.py

import concurrent.futures
import os
import time

from flask_app.general.merge_pdfs import merge_selected_pdfs
from flask_app.general.截取pdf通用函数 import check_pdf_pages
from flask_app.general.通用功能函数 import get_global_logger
from flask_app.工程标.截取pdf工程标版 import truncate_pdf_main_engineering
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main_goods


def truncate_pdf_multiple(pdf_path, output_folder, logger, mode='goods', selections=None):
    """
    Truncate a PDF into several sections in parallel and merge the pieces.

    The mode's truncate function is first run on its own with the mode's
    "invalid" selection (6 for 'goods', 5 for 'engineering'). The path it
    returns is handed to check_pdf_pages, and the end page it returns may be
    forwarded to the remaining truncations. Each requested selection is then
    truncated in its own worker thread and the collected outputs are passed
    to merge_selected_pdfs.

    Args:
        pdf_path (str): Path of the input PDF file.
        output_folder (str): Folder that receives the generated files.
        logger (Logger): Logger used for progress and error messages.
        mode (str, optional): 'goods' or 'engineering'. Defaults to 'goods'.
        selections (List[int], optional): Selection ids to truncate. When
            omitted or empty, [1, 2, 3, 4, 5] is used for 'goods' and
            [1, 2, 3, 4] for 'engineering'.

    Returns:
        list: The flattened per-selection results in the order of
        ``selections`` (a selection that raised contributes one "" — two
        for selection 4), followed by the invalid-selection path when it is
        non-empty, followed by the merged file path, or "" when merging
        produced nothing. When check_pdf_pages signals a skip, the value it
        supplies is returned instead.

    Raises:
        ValueError: If ``mode`` is neither 'goods' nor 'engineering'.
    """
    # mode -> (default selections, id of the "invalid" selection, truncate function)
    mode_table = {
        "goods": ([1, 2, 3, 4, 5], 6, truncate_pdf_main_goods),
        "engineering": ([1, 2, 3, 4], 5, truncate_pdf_main_engineering),
    }

    # Reject unknown modes before doing any work.
    mode_entry = mode_table.get(mode)
    if mode_entry is None:
        raise ValueError("mode 参数必须是 'goods' 或 'engineering'")

    default_selections, invalid_selection, truncate_function = mode_entry
    if not selections:
        selections = default_selections

    # The "invalid" selection is processed synchronously up front; its result
    # carries a file path at index 0 and an end page at index 1.
    invalid_result = truncate_function(pdf_path, output_folder, invalid_selection, logger)
    invalid_path, invalid_endpage = invalid_result[0], invalid_result[1]

    # Content after the invalid section's end page is generally not needed, so
    # there is no point regex-matching beyond it. The end page is only
    # forwarded when it exceeds 120; otherwise -1 is passed
    # (presumably "no limit" — confirm against the truncate functions).
    end_page = -1
    if invalid_endpage > 120:
        end_page = invalid_endpage

    # Page-count check on the invalid-selection output; may short-circuit.
    skip, empty_return = check_pdf_pages(invalid_path, mode, logger)
    if skip:
        return empty_return

    # Bare file name of the input, without directory or extension.
    base_file_name, _ = os.path.splitext(os.path.basename(pdf_path))
    truncate_files = []

    # One worker thread per selection.
    with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
        # Submit every task, remembering which future belongs to which selection.
        pending = {}
        for selection in selections:
            pending[selection] = executor.submit(
                truncate_function,
                pdf_path,
                output_folder,
                selection,
                logger,
                "default",
                end_page,
            )

        # Gather results in the caller's selection order, not completion order.
        for selection in selections:
            task = pending[selection]
            try:
                outcome = task.result()
                # Flatten: a single path is appended, a list of paths is spliced in.
                if isinstance(outcome, str):
                    truncate_files.append(outcome)
                elif isinstance(outcome, list):
                    truncate_files.extend(outcome)
                else:
                    logger.warning(f"未知的返回类型: {type(outcome)}，跳过该结果")
            except Exception as exc:
                logger.error(f"Selection {selection} 生成了一个异常: {exc}")
                # Placeholder entries for the failed selection: two empty
                # strings for selection 4, one for every other selection.
                truncate_files.extend(["", ""] if selection == 4 else [""])

    # Merge the truncated pieces into a single "base info" PDF.
    merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
    merged_path = merge_selected_pdfs(
        output_folder,
        truncate_files,
        merged_output_path,
        base_file_name,
        mode=mode,
    )

    # The invalid-selection file is appended only after merging has been requested.
    if invalid_path:
        truncate_files.append(invalid_path)
        logger.info(f"已添加 invalid_path: {invalid_path}")

    # The last entry is always the merge outcome: the merged path, or "" on failure.
    if not merged_path:
        truncate_files.append("")
        logger.warning(f"合并失败，没有生成合并文件 for {pdf_path}")
    else:
        truncate_files.append(merged_path)
        logger.info(f"已成功合并 PDF 文件到 '{merged_output_path}'。")

    logger.info(f"已截取文件路径: {truncate_files}")
    return truncate_files

if __name__ == "__main__":
    # Manual smoke run against a sample tender document on a developer machine.
    logger = get_global_logger("123")
    started_at = time.time()

    # Alternative sample inputs (uncomment one to use it instead):
    # input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
    # pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf"
    # pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
    # input_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
    pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
    output_folder = r"C:\Users\Administrator\Desktop\招标文件\output33"

    # To process only a subset of selections in 'goods' mode:
    # selections = [1, 4]
    # selections = [1, 2, 3, 5]
    # files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods', selections)
    files = truncate_pdf_multiple(pdf_path, output_folder, logger, mode='engineering')
    print(files)
    # print(files[-1])
    # print(files[-2])

    # To run a single engineering selection directly
    # (1 - tender announcement, 2 - bid evaluation method,
    #  3 - qualification review criteria,
    #  4 - instructions-to-bidders front table + body, 5 - invalid bid):
    # selection = 1
    # generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection, logger)
    # print(generated_files)
    # print("生成的文件:", generated_files)

    finished_at = time.time()
    print(f"耗时：{finished_at - started_at}")