zbparse/flask_app/general/截取pdf_main.py

127 lines
5.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import concurrent.futures
import os
import time
from flask_app.general.merge_pdfs import merge_selected_pdfs
from flask_app.general.截取pdf通用函数 import check_pdf_pages
from flask_app.general.通用功能函数 import get_global_logger
from flask_app.工程标.截取pdf工程标版 import truncate_pdf_main_engineering
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main_goods
def truncate_pdf_multiple(pdf_path, output_folder, logger, mode='goods', selections=None):
    """
    Truncate a PDF into the selected sections concurrently, then merge them.

    One truncation task per entry in ``selections`` is submitted to a thread
    pool. Results are collected in the order of ``selections`` and handed to
    ``merge_selected_pdfs``; the merged path (or "") is appended last.

    Args:
        pdf_path (str): Path of the input PDF file.
        output_folder (str): Output folder passed to the truncation and
            merge helpers.
        logger (Logger): Logger used for progress and error messages.
        mode (str, optional): 'goods' or 'engineering'. Selects
            ``truncate_pdf_main_goods`` or ``truncate_pdf_main_engineering``
            and is forwarded to the merge step. Defaults to 'goods'.
        selections (list[int], optional): Selection numbers to process.
            Defaults to [1, 2, 3, 4, 5].

    Returns:
        list: Truncated file paths in selection order, followed by the merged
        file path, or "" when the merge step returned a falsy value. A
        selection whose task raised contributes empty-string placeholders
        instead (two for selection 4, one otherwise). If ``check_pdf_pages``
        returns a non-None value, that value is returned as-is and no
        truncation is attempted.

    Raises:
        ValueError: If ``mode`` is neither 'goods' nor 'engineering'.
    """
    if selections is None:
        selections = [1, 2, 3, 4, 5]

    # Reject unknown modes before doing any work.
    if mode not in ('goods', 'engineering'):
        raise ValueError("mode 参数必须是 'goods' 或 'engineering'")

    def handle_exception(selection):
        # Selection 4 is represented by two output slots, every other
        # selection by one, so a failed task is padded accordingly.
        return ["", ""] if selection == 4 else [""]

    # Pick the mode-specific truncation function and merge mode.
    if mode == 'goods':
        truncate_function = truncate_pdf_main_goods
        merge_mode = 'goods'
    else:  # mode == 'engineering'
        truncate_function = truncate_pdf_main_engineering
        merge_mode = 'engineering'

    # check_pdf_pages receives flag 1 for a full run (5 or more selections)
    # and flag 2 for a partial run.
    num_selections = len(selections)
    mode_flag = 2 if num_selections < 5 else 1
    res = check_pdf_pages(pdf_path, mode_flag, logger)
    if res is not None:
        return res  # early result supplied by check_pdf_pages

    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]  # bare file name
    truncate_files = []

    # ThreadPoolExecutor raises ValueError for max_workers <= 0, so an empty
    # selection list must still request at least one worker.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, num_selections)) as executor:
        # Keep (selection, future) pairs in submission order. A list, unlike a
        # dict keyed by selection, does not lose futures on duplicate entries.
        submitted = [
            (
                selection,
                executor.submit(
                    truncate_function,
                    pdf_path,
                    output_folder,
                    selection,
                    logger,
                ),
            )
            for selection in selections
        ]

        # Collect results in the same order as ``selections``.
        for selection, future in submitted:
            try:
                files = future.result()
                if files:
                    truncate_files.extend(files)
            except Exception as e:
                logger.error(f"Selection {selection} 生成了一个异常: {e}")
                # Pad with placeholders so later slots keep their position.
                truncate_files.extend(handle_exception(selection))

    # Destination of the merged PDF.
    merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
    merged_path = merge_selected_pdfs(
        output_folder,
        truncate_files,
        merged_output_path,
        base_file_name,
        mode=merge_mode
    )
    if merged_path:
        # Merge succeeded: the merged file path becomes the last entry.
        truncate_files.append(merged_path)
        logger.info(f"已成功合并 PDF 文件到 '{merged_output_path}'")
    else:
        # Merge produced nothing: keep the slot with an empty string.
        truncate_files.append("")
        logger.warning(f"合并失败,没有生成合并文件 for {pdf_path}")

    logger.info("已截取文件路径: " + str(truncate_files))
    return truncate_files
if __name__ == "__main__":
    # Manual smoke run against a local sample file; the paths below are
    # machine-specific and only meaningful on the author's workstation.
    logger = get_global_logger("123")
    started_at = time.time()

    # Alternative sample inputs used during development:
    #   r"C:\Users\Administrator\Desktop\new招标文件\工程标"
    #   r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
    #   r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
    pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest5.pdf"
    output_folder = r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp"

    # Selection legend: 1 - tender announcement, 2 - bid evaluation method,
    # 3 - qualification review conditions, 4 - instructions-to-bidders
    # front table + body, 5 - invalid bid.
    # Use e.g. [1, 4] to process several selections in one run.
    selections = [5]

    generated = truncate_pdf_multiple(
        pdf_path,
        output_folder,
        logger,
        mode='engineering',
        selections=selections,
    )
    print(generated)

    # A single selection can also be run directly through
    # truncate_pdf_main_engineering(pdf_path, output_folder, selection, logger).
    elapsed = time.time() - started_at
    print(f"耗时:{elapsed}")