10.25切分改为多线程
This commit is contained in:
parent
7a03351f28
commit
6ecc370d27
File diff suppressed because it is too large
Load Diff
@ -2,6 +2,7 @@ from PyPDF2 import PdfReader, PdfWriter
|
|||||||
import re # 导入正则表达式库
|
import re # 导入正则表达式库
|
||||||
import os # 用于文件和文件夹操作
|
import os # 用于文件和文件夹操作
|
||||||
from flask_app.general.merge_pdfs import merge_pdfs
|
from flask_app.general.merge_pdfs import merge_pdfs
|
||||||
|
import concurrent.futures
|
||||||
def clean_page_content(text, common_header):
|
def clean_page_content(text, common_header):
|
||||||
# 首先删除抬头公共部分
|
# 首先删除抬头公共部分
|
||||||
if common_header: # 确保有公共抬头才进行替换
|
if common_header: # 确保有公共抬头才进行替换
|
||||||
@ -390,15 +391,41 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na
|
|||||||
print(f"合并 PDF 文件时出错: {e}")
|
print(f"合并 PDF 文件时出错: {e}")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
def truncate_pdf_multiple(input_path, output_folder):
|
def truncate_pdf_multiple(input_path, output_folder):
|
||||||
|
"""
|
||||||
|
处理 PDF 文件,选择 selection 1-6 的部分,并合并结果。
|
||||||
|
|
||||||
|
Args:
|
||||||
|
input_path (str): 要处理的 PDF 文件路径。
|
||||||
|
output_folder (str): 截取后的文件保存文件夹路径。
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
list: 截取的文件路径列表,包括合并后的文件路径(如果有)。
|
||||||
|
"""
|
||||||
base_file_name = os.path.splitext(os.path.basename(input_path))[0] # 纯文件名
|
base_file_name = os.path.splitext(os.path.basename(input_path))[0] # 纯文件名
|
||||||
truncate_files = []
|
truncate_files = []
|
||||||
for selection in range(1, 7):
|
selections = range(1, 7) # 选择 1 到 6
|
||||||
files = truncate_pdf_main(input_path, output_folder, selection)
|
|
||||||
if files:
|
# 使用 ThreadPoolExecutor 进行多线程处理
|
||||||
truncate_files.extend(files)
|
with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
|
||||||
else:
|
# 提交所有任务并保持 selection 顺序
|
||||||
truncate_files.append("") # 截取失败时添加空字符串
|
future_to_selection = {selection: executor.submit(truncate_pdf_main, input_path, output_folder, selection) for selection in selections}
|
||||||
|
|
||||||
|
# 按 selection 顺序收集结果
|
||||||
|
for selection in selections:
|
||||||
|
future = future_to_selection.get(selection)
|
||||||
|
try:
|
||||||
|
files = future.result()
|
||||||
|
if files and any(f for f in files):
|
||||||
|
# 过滤空字符串
|
||||||
|
valid_files = [f for f in files if f]
|
||||||
|
truncate_files.extend(valid_files)
|
||||||
|
else:
|
||||||
|
truncate_files.append("") # 截取失败时添加空字符串
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Selection {selection} 生成了一个异常: {e}")
|
||||||
|
truncate_files.append("") # 发生异常时添加空字符串
|
||||||
|
|
||||||
if any(f for f in truncate_files if f): # 检查是否有有效的文件路径
|
if any(f for f in truncate_files if f): # 检查是否有有效的文件路径
|
||||||
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
|
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
|
||||||
@ -414,14 +441,16 @@ def truncate_pdf_multiple(input_path, output_folder):
|
|||||||
print(f"没有文件需要合并 for {input_path}")
|
print(f"没有文件需要合并 for {input_path}")
|
||||||
|
|
||||||
return truncate_files
|
return truncate_files
|
||||||
|
|
||||||
|
|
||||||
def truncate_pdf_specific_engineering(pdf_path, output_folder, selections):
|
def truncate_pdf_specific_engineering(pdf_path, output_folder, selections):
|
||||||
"""
|
"""
|
||||||
处理 PDF 文件,选择 selection 为指定部分,并合并结果。
|
处理 PDF 文件,选择指定的 selections,并合并结果。
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
pdf_path (str): 要处理的 PDF 文件路径。
|
pdf_path (str): 要处理的 PDF 文件路径。
|
||||||
output_folder (str): 截取后的文件保存文件夹路径。
|
output_folder (str): 截取后的文件保存文件夹路径。
|
||||||
selections (list): 需要截取的部分
|
selections (list): 需要截取的部分(例如 [4, 5])。
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
list: 截取的文件路径列表,包括合并后的文件路径(如果有)。
|
list: 截取的文件路径列表,包括合并后的文件路径(如果有)。
|
||||||
@ -430,19 +459,29 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections):
|
|||||||
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
||||||
truncate_files = []
|
truncate_files = []
|
||||||
|
|
||||||
for selection in selections:
|
# 使用 ThreadPoolExecutor 进行多线程处理
|
||||||
files = truncate_pdf_main(pdf_path, output_folder, selection)
|
with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
|
||||||
if files:
|
# 提交所有任务并保持 selection 顺序
|
||||||
if isinstance(files, list):
|
future_to_selection = {selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection) for selection in selections}
|
||||||
truncate_files.extend(files)
|
|
||||||
elif isinstance(files, str):
|
# 按 selection 顺序收集结果
|
||||||
truncate_files.append(files)
|
for selection in selections:
|
||||||
else:
|
future = future_to_selection.get(selection)
|
||||||
truncate_files.append("") # 截取失败时添加空字符串
|
try:
|
||||||
print(f"截取 selection {selection} 失败,已添加空字符串。")
|
files = future.result()
|
||||||
|
if files and any(f for f in files):
|
||||||
|
# 过滤空字符串
|
||||||
|
valid_files = [f for f in files if f]
|
||||||
|
truncate_files.extend(valid_files)
|
||||||
|
else:
|
||||||
|
truncate_files.append("") # 截取失败时添加空字符串
|
||||||
|
print(f"截取 selection {selection} 失败,已添加空字符串。")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Selection {selection} 生成了一个异常: {e}")
|
||||||
|
truncate_files.append("") # 发生异常时添加空字符串
|
||||||
|
|
||||||
if any(f for f in truncate_files if f): # 检查是否有有效的文件路径
|
if any(f for f in truncate_files if f): # 检查是否有有效的文件路径
|
||||||
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
|
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_specific.pdf")
|
||||||
merged_result = merge_selected_pdfs(output_folder, truncate_files, merged_output_path, base_file_name)
|
merged_result = merge_selected_pdfs(output_folder, truncate_files, merged_output_path, base_file_name)
|
||||||
if merged_result:
|
if merged_result:
|
||||||
truncate_files.append(merged_result)
|
truncate_files.append(merged_result)
|
||||||
@ -468,11 +507,11 @@ if __name__ == "__main__":
|
|||||||
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\e378e002-45ff-440d-a65a-9974b5015472\\ztbfile.pdf"
|
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\e378e002-45ff-440d-a65a-9974b5015472\\ztbfile.pdf"
|
||||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
|
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
|
||||||
output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\output6"
|
output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\output6"
|
||||||
# files=truncate_pdf_multiple(input_path,output_folder)
|
files=truncate_pdf_multiple(input_path,output_folder)
|
||||||
# selections = [5, 1] # 仅处理 selection 5、1 和 3
|
# selections = [5, 1] # 仅处理 selection 5、1 和 3
|
||||||
# files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
|
# files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
|
||||||
# print(files)
|
print(files)
|
||||||
selection = 6 # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标
|
# selection = 6 # 例如:1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标
|
||||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
# generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||||
print(generated_files)
|
# print(generated_files)
|
||||||
# print("生成的文件:", generated_files)
|
# print("生成的文件:", generated_files)
|
@ -3,7 +3,7 @@ import re # 导入正则表达式库
|
|||||||
import os # 用于文件和文件夹操作
|
import os # 用于文件和文件夹操作
|
||||||
from flask_app.general.format_change import docx2pdf
|
from flask_app.general.format_change import docx2pdf
|
||||||
from flask_app.general.merge_pdfs import merge_and_cleanup,merge_pdfs
|
from flask_app.general.merge_pdfs import merge_and_cleanup,merge_pdfs
|
||||||
|
import concurrent.futures
|
||||||
def clean_page_content(text, common_header):
|
def clean_page_content(text, common_header):
|
||||||
# 首先删除抬头公共部分
|
# 首先删除抬头公共部分
|
||||||
if common_header: # 确保有公共抬头才进行替换
|
if common_header: # 确保有公共抬头才进行替换
|
||||||
@ -431,8 +431,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
|
|||||||
else:
|
else:
|
||||||
print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
|
print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
|
||||||
return ""
|
return ""
|
||||||
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix,
|
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
|
||||||
)
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error in extract_pages_twice: {e}")
|
print(f"Error in extract_pages_twice: {e}")
|
||||||
return ""
|
return ""
|
||||||
@ -624,14 +623,30 @@ def truncate_pdf_main(input_path, output_folder, selection, output_suffix="defau
|
|||||||
def truncate_pdf_multiple(pdf_path, output_folder):
|
def truncate_pdf_multiple(pdf_path, output_folder):
|
||||||
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
||||||
truncate_files = []
|
truncate_files = []
|
||||||
for selection in range(1, 6):
|
|
||||||
files = truncate_pdf_main(pdf_path, output_folder, selection)
|
# 定义要处理的选择范围
|
||||||
if files:
|
selections = range(1, 6)
|
||||||
truncate_files.extend(files)
|
|
||||||
|
# 使用 ThreadPoolExecutor 进行多线程处理
|
||||||
|
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
|
||||||
|
# 提交所有任务并保持 selection 顺序
|
||||||
|
future_to_selection = {
|
||||||
|
selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection, output_suffix="default")
|
||||||
|
for selection in selections}
|
||||||
|
|
||||||
|
# 按 selection 顺序收集结果
|
||||||
|
for selection in selections:
|
||||||
|
future = future_to_selection.get(selection)
|
||||||
|
try:
|
||||||
|
files = future.result()
|
||||||
|
if files:
|
||||||
|
truncate_files.extend(files)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Selection {selection} 生成了一个异常: {e}")
|
||||||
|
|
||||||
if truncate_files:
|
if truncate_files:
|
||||||
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
|
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
|
||||||
merge_selected_pdfs(output_folder, truncate_files, merged_output_path,base_file_name)
|
merge_selected_pdfs(output_folder, truncate_files, merged_output_path, base_file_name)
|
||||||
truncate_files.append(merged_output_path)
|
truncate_files.append(merged_output_path)
|
||||||
print(f"已生成合并文件: {merged_output_path}")
|
print(f"已生成合并文件: {merged_output_path}")
|
||||||
else:
|
else:
|
||||||
@ -640,13 +655,14 @@ def truncate_pdf_multiple(pdf_path, output_folder):
|
|||||||
return truncate_files
|
return truncate_files
|
||||||
|
|
||||||
#小解析,只需要前三章内容
|
#小解析,只需要前三章内容
|
||||||
def truncate_pdf_specific_goods(pdf_path, output_folder,selections):
|
def truncate_pdf_specific_goods(pdf_path, output_folder, selections):
|
||||||
"""
|
"""
|
||||||
处理 PDF 文件,选择 selection 为 4 和 5 的部分,并合并结果。
|
处理 PDF 文件,选择指定的 selections,并合并结果。
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
pdf_path (str): 要处理的 PDF 文件路径。
|
pdf_path (str): 要处理的 PDF 文件路径。
|
||||||
output_folder (str): 截取后的文件保存文件夹路径。
|
output_folder (str): 截取后的文件保存文件夹路径。
|
||||||
|
selections (iterable): 要处理的 selection 列表。
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
list: 截取的文件路径列表,包括合并后的文件路径(如果有)。
|
list: 截取的文件路径列表,包括合并后的文件路径(如果有)。
|
||||||
@ -654,13 +670,23 @@ def truncate_pdf_specific_goods(pdf_path, output_folder,selections):
|
|||||||
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
||||||
truncate_files = []
|
truncate_files = []
|
||||||
|
|
||||||
for selection in selections:
|
# 使用 ThreadPoolExecutor 进行多线程处理
|
||||||
files = truncate_pdf_main(pdf_path, output_folder, selection)
|
with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
|
||||||
if files:
|
# 提交所有任务并保持 selection 顺序
|
||||||
if isinstance(files, list):
|
future_to_selection = {selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection, output_suffix="default") for selection in selections}
|
||||||
truncate_files.extend(files)
|
|
||||||
elif isinstance(files, str):
|
# 按 selection 顺序收集结果
|
||||||
truncate_files.append(files)
|
for selection in selections:
|
||||||
|
future = future_to_selection.get(selection)
|
||||||
|
try:
|
||||||
|
files = future.result()
|
||||||
|
if files:
|
||||||
|
if isinstance(files, list):
|
||||||
|
truncate_files.extend(files)
|
||||||
|
elif isinstance(files, str):
|
||||||
|
truncate_files.append(files)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Selection {selection} 生成了一个异常: {e}")
|
||||||
|
|
||||||
if truncate_files:
|
if truncate_files:
|
||||||
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_specific.pdf")
|
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_specific.pdf")
|
||||||
@ -675,16 +701,17 @@ def truncate_pdf_specific_goods(pdf_path, output_folder,selections):
|
|||||||
|
|
||||||
# TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 资格审查文件可能不需要默认与"evaluation"同一章 无效投标可能也要考虑 “more”的情况,类似工程标
|
# TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 资格审查文件可能不需要默认与"evaluation"同一章 无效投标可能也要考虑 “more”的情况,类似工程标
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件(广水市教育局封闭管理项目二次).pdf"
|
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件(广水市教育局封闭管理项目二次).pdf"
|
||||||
input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.pdf"
|
# input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.pdf"
|
||||||
# input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
|
# input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
|
||||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
|
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
|
||||||
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
|
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
|
||||||
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
|
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
|
||||||
output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
|
output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
|
||||||
# files = truncate_pdf_multiple(input_path, output_folder)
|
# files = truncate_pdf_multiple(input_path, output_folder)
|
||||||
# files=truncate_pdf_specific_goods(input_path,output_folder)
|
selections = [4, 5]
|
||||||
# print(files)
|
files=truncate_pdf_specific_goods(input_path,output_folder,selections)
|
||||||
selection = 6# 例如:1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-公告
|
print(files)
|
||||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
# selection = 6# 例如:1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-公告
|
||||||
print(generated_files)
|
# generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||||
|
# print(generated_files)
|
Loading…
x
Reference in New Issue
Block a user