2024-12-17 18:44:58 +08:00
|
|
|
|
import concurrent.futures
|
|
|
|
|
import os
|
|
|
|
|
import time
|
|
|
|
|
from flask_app.general.merge_pdfs import merge_selected_pdfs
|
|
|
|
|
from flask_app.general.截取pdf通用函数 import check_pdf_pages
|
|
|
|
|
from flask_app.general.通用功能函数 import get_global_logger
|
|
|
|
|
from flask_app.工程标.截取pdf工程标版 import truncate_pdf_main_engineering
|
|
|
|
|
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main_goods
|
|
|
|
|
|
|
|
|
|
def truncate_pdf_multiple(pdf_path, output_folder, logger, mode='goods', selections=None):
    """Cut a tender PDF into the requested sections and merge them into one file.

    The mode-specific truncation function is first run for the mode's
    "invalid bid" selection, then once per requested selection on a thread
    pool. The collected paths are finally handed to ``merge_selected_pdfs``.

    Args:
        pdf_path (str): Path of the input PDF.
        output_folder (str): Folder passed to the truncation functions; the
            merged file path is also built inside it.
        logger: Logger used here and forwarded to the helper functions.
        mode (str, optional): ``'goods'`` (the default) or ``'engineering'``.
        selections (list[int], optional): Selection numbers to extract. When
            omitted or empty, ``[1, 2, 3, 4, 5]`` is used for ``'goods'`` and
            ``[1, 2, 3, 4]`` for ``'engineering'``.

    Returns:
        list: In order, the paths produced for each selection (a selection
        that raised contributes ``""`` placeholders: two for selection 4,
        one otherwise), then the invalid-bid file path if it is non-empty,
        then the merged file path or ``""`` when merging produced nothing.
        If ``check_pdf_pages`` asks to skip the document, its second return
        value is returned unchanged instead.

    Raises:
        ValueError: If ``mode`` is neither ``'goods'`` nor ``'engineering'``.
    """
    supported_modes = {
        "goods": {
            "selections": [1, 2, 3, 4, 5],
            "invalid_selection": 6,
            "truncate_func": truncate_pdf_main_goods,
        },
        "engineering": {
            "selections": [1, 2, 3, 4],
            "invalid_selection": 5,
            "truncate_func": truncate_pdf_main_engineering,
        },
    }

    # Reject unsupported modes before doing any work.
    if mode not in supported_modes:
        raise ValueError("mode 参数必须是 'goods' 或 'engineering'")

    mode_settings = supported_modes[mode]
    cut_pdf = mode_settings["truncate_func"]
    selections = selections or mode_settings["selections"]

    # Extract the "invalid bid" section first; its result is read as
    # (file path, end page).
    invalid_result = cut_pdf(
        pdf_path, output_folder, mode_settings["invalid_selection"], logger
    )
    invalid_path = invalid_result[0]
    invalid_endpage = invalid_result[1]

    # Original author's note: content after the invalid section's end page is
    # normally not needed, so it does not have to be regex-matched. The end
    # page is only forwarded when it is beyond page 120; otherwise -1 is sent.
    end_page = -1
    if invalid_endpage > 120:
        end_page = invalid_endpage

    # Page-count check (original author's note: PDFs under 30 pages are not split).
    should_skip, skip_result = check_pdf_pages(invalid_path, mode, logger)
    if should_skip:
        return skip_result

    # File name without directory or extension.
    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
    truncate_files = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as pool:
        # Submit one task per selection, keyed by selection number.
        pending = {}
        for selection in selections:
            pending[selection] = pool.submit(
                cut_pdf,
                pdf_path,
                output_folder,
                selection,
                logger,
                "default",  # presumably the output suffix -- confirm against the truncate functions
                end_page,
            )

        # Collect results in the caller's selection order, not completion order.
        for selection in selections:
            try:
                produced = pending[selection].result()
                # Flatten: a task may return a list of paths or a single path.
                if isinstance(produced, list):
                    truncate_files.extend(produced)
                elif isinstance(produced, str):
                    truncate_files.append(produced)
                else:
                    logger.warning(f"未知的返回类型: {type(produced)},跳过该结果")
            except Exception as e:
                logger.error(f"Selection {selection} 生成了一个异常: {e}")
                # Keep the result list positionally stable: selection 4
                # contributes two entries, every other selection one.
                truncate_files.extend(["", ""] if selection == 4 else [""])

    # Merge the extracted pieces into a single "<name>_merged_baseinfo.pdf".
    merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
    merged_path = merge_selected_pdfs(
        output_folder,
        truncate_files,
        merged_output_path,
        base_file_name,
        mode=mode,
    )

    # The invalid-bid file is appended only after merging, so it is not part
    # of the list given to merge_selected_pdfs.
    if invalid_path:
        truncate_files.append(invalid_path)

    if not merged_path:
        # Merge failed: keep the slot with an empty placeholder.
        truncate_files.append("")
        logger.warning(f"合并失败,没有生成合并文件 for {pdf_path}")
    else:
        truncate_files.append(merged_path)

    logger.info(f"已截取文件路径: {truncate_files}")
    return truncate_files
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run against a local sample document; the paths below are
    # specific to the original developer's machine.
    logger = get_global_logger("123")
    started_at = time.time()

    # Alternative sample input kept for reference:
    # pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
    pdf_path = r"C:\Users\Administrator\Desktop\fsdownload\00530231-d5a1-44a3-8639-6d660d9ab509\ztbfile.pdf"
    output_folder = r"C:\Users\Administrator\Desktop\fsdownload\00530231-d5a1-44a3-8639-6d660d9ab509\tmp"

    # Selection legend (from the original note): 1 - tender announcement,
    # 2 - bid evaluation method, 3 - qualification review conditions,
    # 4 - instructions to bidders (front table + body), 5 - invalid bid.
    # To process only some sections in goods mode:
    # selections = [1, 2, 3, 5]
    # files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods', selections)
    files = truncate_pdf_multiple(pdf_path, output_folder, logger, mode='engineering')
    print(files)

    # Report the elapsed wall-clock time in seconds.
    finished_at = time.time()
    print(f"耗时:{finished_at - started_at}")
|