zbparse/flask_app/general/截取pdf_main.py

134 lines
5.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import concurrent.futures
import os
import time
from flask_app.general.merge_pdfs import merge_selected_pdfs
from flask_app.general.截取pdf通用函数 import check_pdf_pages
from flask_app.general.通用功能函数 import get_global_logger
from flask_app.工程标.截取pdf工程标版 import truncate_pdf_main_engineering
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main_goods
def truncate_pdf_multiple(pdf_path, output_folder, logger, mode='goods', selections=None):
    """
    Truncate a PDF into the requested sections and merge the results.

    Works in two modes, 'goods' and 'engineering', each with its own
    truncation function and its own set of default selections.

    Parameters:
    - pdf_path (str): path of the input PDF file.
    - output_folder (str): folder passed to the truncation and merge helpers.
    - logger (Logger): logger used for progress and error messages.
    - mode (str, optional): 'goods' or 'engineering'. Defaults to 'goods'.
    - selections (List[int], optional): selections to extract. When omitted
      or empty, the mode's full list is used ([1, 2, 3, 4, 5] for 'goods',
      [1, 2, 3, 4] for 'engineering').

    Returns:
    - list: the truncated file paths in the order of `selections`, followed
      by the "invalid" file path (only when it is non-empty), followed by the
      merged file path or "" when merging produced nothing. A selection whose
      job raised contributes empty-string placeholders instead of paths.
      If check_pdf_pages asks to skip, its second return value is returned
      unchanged.

    Raises:
    - ValueError: if `mode` is not one of the configured modes.
    """
    def handle_exception(selection):
        # Selection 4 gets two placeholder slots, every other selection one.
        return ["", ""] if selection == 4 else [""]

    modes_config = {
        "goods": {"selections": [1, 2, 3, 4, 5], "invalid_selection": 6, "truncate_func": truncate_pdf_main_goods},
        "engineering": {"selections": [1, 2, 3, 4], "invalid_selection": 5, "truncate_func": truncate_pdf_main_engineering},
    }
    # Reject unknown modes before doing any work.
    if mode not in modes_config:
        raise ValueError("mode 参数必须是 'goods''engineering'")

    # Resolve the per-mode settings.
    config = modes_config[mode]
    truncate_function = config["truncate_func"]
    selections = selections or config["selections"]
    invalid_selection = config["invalid_selection"]

    # The "invalid" selection is produced first, synchronously; its first
    # returned path feeds the page-count check below.
    invalid_path = truncate_function(pdf_path, output_folder, invalid_selection, logger)[0]

    # Page-count check: bail out with the helper-provided result if requested.
    skip, empty_return = check_pdf_pages(invalid_path, mode, logger)
    if skip:
        return empty_return

    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]  # bare file name, no extension
    truncate_files = []

    # Submit exactly one job per distinct selection. Duplicated entries in
    # `selections` would otherwise launch redundant concurrent jobs for the
    # same output and silently drop all but the last future.
    unique_selections = list(dict.fromkeys(selections))

    with concurrent.futures.ThreadPoolExecutor(max_workers=len(unique_selections)) as executor:
        selection_to_future = {
            selection: executor.submit(
                truncate_function,
                pdf_path,
                output_folder,
                selection,
                logger,
                "default"
                # Extra arguments needed by 'goods' mode (e.g. an output
                # suffix) could be handled here.
            )
            for selection in unique_selections
        }

        # Collect results in the caller's selection order (repeated entries
        # reuse the single future submitted for that selection).
        for selection in selections:
            future = selection_to_future[selection]
            try:
                files = future.result()
                # Flatten the result into the output list.
                if isinstance(files, list):
                    truncate_files.extend(files)
                elif isinstance(files, str):  # a single path is appended as-is
                    truncate_files.append(files)
                else:
                    logger.warning(f"未知的返回类型: {type(files)},跳过该结果")
            except Exception as e:
                logger.error(f"Selection {selection} 生成了一个异常: {e}")
                # Keep the list populated with placeholders for the failed selection.
                truncate_files.extend(handle_exception(selection))

    # Output path for the merged document.
    merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")

    # Merge is invoked before invalid_path is appended to the list.
    merged_path = merge_selected_pdfs(
        output_folder,
        truncate_files,
        merged_output_path,
        base_file_name,
        mode=mode
    )

    if invalid_path:
        truncate_files.append(invalid_path)
        logger.info(f"已添加 invalid_path: {invalid_path}")

    if merged_path:
        # Merge succeeded: record the merged file path.
        truncate_files.append(merged_path)
        logger.info(f"已成功合并 PDF 文件到 '{merged_output_path}'")
    else:
        # Merge failed: keep the slot with an empty string.
        truncate_files.append("")
        logger.warning(f"合并失败,没有生成合并文件 for {pdf_path}")

    logger.info("已截取文件路径: " + str(truncate_files))
    return truncate_files
if __name__ == "__main__":
    # Manual run: truncate one local tender PDF in 'goods' mode and print the
    # resulting paths together with the elapsed wall-clock time.
    logger = get_global_logger("123")
    began_at = time.time()

    # Alternative inputs kept for convenience:
    # input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
    # pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf"
    # pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
    # pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
    # input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
    pdf_path = r'C:\Users\Administrator\Desktop\货物标\zbfiles\招标文件(实高电子显示屏).pdf'
    output_folder = r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp"

    # selections = [1, 4]  # process only selections 4 and 1
    selections = [1, 3, 5]

    result_paths = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods', selections)
    # result_paths = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods')
    print(result_paths)
    # print(result_paths[-1])
    # print(result_paths[-2])

    # Single-selection run of the engineering variant, e.g.
    # 1 - tender announcement, 2 - bid evaluation method,
    # 3 - qualification review criteria,
    # 4 - instructions-to-bidders front table + main text, 5 - invalid-bid clauses
    # selection = 1
    # generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection,logger)
    # print(generated_files)
    # print("生成的文件:", generated_files)

    finished_at = time.time()
    elapsed = finished_at - began_at
    print("耗时:" + str(elapsed))