zbparse/flask_app/general/截取pdf_main.py

127 lines
5.2 KiB
Python
Raw Normal View History

2024-12-17 18:44:58 +08:00
import concurrent.futures
import os
import time
from flask_app.general.merge_pdfs import merge_selected_pdfs
from flask_app.general.截取pdf通用函数 import check_pdf_pages
from flask_app.general.通用功能函数 import get_global_logger
from flask_app.工程标.截取pdf工程标版 import truncate_pdf_main_engineering
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main_goods
def truncate_pdf_multiple(pdf_path, output_folder, logger, mode='goods', selections=None):
    """
    Extract the selected sections of a PDF in parallel and merge them.

    Each entry of ``selections`` is handed to the mode-specific truncate
    function on its own worker thread. Results are collected in the order
    given by ``selections`` and then passed to ``merge_selected_pdfs``.

    Args:
        pdf_path (str): Path of the input PDF file.
        output_folder (str): Folder that receives the generated files.
        logger (Logger): Logger used for progress and error messages.
        mode (str, optional): Either 'goods' or 'engineering'.
            Defaults to 'goods'.
        selections (List[int], optional): Selection ids to process.
            Defaults to [1, 2, 3, 4, 5].

    Returns:
        list: Truncated file paths in selection order, followed by the merged
        file path, or "" when merging produced nothing. A selection whose
        worker raised contributes "" placeholders instead (two for
        selection 4, one otherwise). If ``check_pdf_pages`` returns a
        non-None value, that value is returned unchanged and nothing is
        truncated.

    Raises:
        ValueError: If ``mode`` is neither 'goods' nor 'engineering'.
    """
    if selections is None:
        selections = [1, 2, 3, 4, 5]

    # Validate the mode before doing any work.
    if mode not in ('goods', 'engineering'):
        raise ValueError("mode 参数必须是 'goods' 或 'engineering'")

    def handle_exception(selection):
        # Selection 4 gets two placeholder slots, every other selection one.
        return ["", ""] if selection == 4 else [""]

    # Pick the mode-specific truncate function and merge mode.
    if mode == 'goods':
        logger.info("call 货物标截取pdf")
        truncate_function = truncate_pdf_main_goods
        merge_mode = 'goods'
    else:  # mode == 'engineering'
        logger.info("call 工程标标截取pdf")
        truncate_function = truncate_pdf_main_engineering
        merge_mode = 'engineering'

    res = check_pdf_pages(pdf_path, logger)
    if res is not None:
        # NOTE(review): presumably a list of empty-string placeholders
        # produced by check_pdf_pages -- confirm against that helper.
        return res

    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]  # bare file name
    truncate_files = []

    # ThreadPoolExecutor rejects max_workers <= 0, so an empty selection list
    # still needs a pool size of at least one.
    worker_count = max(1, len(selections))
    with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
        # One future per entry, kept in a list so that repeated selection ids
        # do not overwrite each other and submission order is preserved.
        futures = [
            executor.submit(
                truncate_function,
                pdf_path,
                output_folder,
                selection,
                logger,
                "default",
            )
            for selection in selections
        ]

        # Collect results in selection order.
        for selection, future in zip(selections, futures):
            try:
                files = future.result()
                if files:
                    truncate_files.extend(files)
                # An empty result is not padded here; per the original
                # author's note, truncate_function is expected to add its own
                # placeholders and log the failure.
            except Exception as e:
                logger.error(f"Selection {selection} 生成了一个异常: {e}")
                # Keep the result list aligned by adding placeholder slots.
                truncate_files.extend(handle_exception(selection))

    # Merge the truncated pieces into a single "<name>_merged_baseinfo.pdf".
    merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
    merged_path = merge_selected_pdfs(
        output_folder,
        truncate_files,
        merged_output_path,
        base_file_name,
        mode=merge_mode,
    )
    if merged_path:
        # Merge succeeded: append the merged file path.
        truncate_files.append(merged_path)
        logger.info(f"已成功合并 PDF 文件到 '{merged_output_path}'")
    else:
        # Merge failed: append an empty placeholder.
        truncate_files.append("")
        logger.warning(f"合并失败,没有生成合并文件 for {pdf_path}")

    logger.info("已截取文件路径: " + str(truncate_files))
    return truncate_files
if __name__ == "__main__":
    # Manual run against a local sample file: truncate, print the resulting
    # paths, then print the elapsed wall-clock time.
    logger = get_global_logger("123")
    started_at = time.time()

    # Alternative inputs kept for quick switching:
    # input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
    # pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
    # input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
    pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest5.pdf"
    output_folder = r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp"

    # Process only selections 1 and 4.
    selections = [1, 4]
    # selections=[5]

    # engineering mode
    result_paths = truncate_pdf_multiple(
        pdf_path,
        output_folder,
        logger,
        mode='engineering',
        selections=selections,
    )
    print(result_paths)

    # Single-selection variant. Legend from the original note:
    # 1 - 招标公告, 2 - 评标办法, 3 - 资格审查条件, 4 - 投标人须知前附表+正文, 5 - 无效标
    # selection = 1
    # generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection,logger)
    # print(generated_files)
    # print("生成的文件:", generated_files)

    elapsed = time.time() - started_at
    print(f"耗时:{elapsed}")