2024-12-09 17:38:01 +08:00

606 lines
32 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import os
import time
from PyPDF2 import PdfReader, PdfWriter
from flask_app.general.clean_pdf import clean_page_content, extract_common_header
from flask_app.general.merge_pdfs import merge_selected_pdfs_for_engineering
import concurrent.futures
import logging
def get_global_logger(unique_id):
    """Return the logger registered under *unique_id*, or the root logger when None."""
    return logging.getLogger() if unique_id is None else logging.getLogger(unique_id)


# Module-level logger; rebound per request by the public entry points below.
logger = None
def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page, common_header):
    """
    Extract pages [start_page, end_page] (0-based, inclusive) from *pdf_path*
    and save them as "<basename>_<output_suffix>.pdf" in *output_folder*.

    For output_suffix == 'notice' an extra "<basename>_before.pdf" is also
    written with the pages preceding start_page, truncated at the table of
    contents ("目录") page when one is found.

    Returns the path of the main output PDF, or "" on failure / bad range.
    """
    try:
        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
        pdf_document = PdfReader(pdf_path)
        total_pages = len(pdf_document.pages)
        # Validate the requested page range before touching any output file.
        if start_page < 0 or end_page >= total_pages or start_page > end_page:
            # FIX: separator between the two page numbers was lost in the
            # original f-string; restored as "到" (to).
            print(f"无效的页面范围: {start_page} 到 {end_page}")
            return ""
        # For 'notice', also save the pages before start_page (cut at "目录" if present).
        if output_suffix == 'notice' and start_page > 0:
            before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
            before_doc = PdfWriter()
            toc_page = -1
            # Look for the table-of-contents page among the leading pages.
            for page_num in range(min(start_page, total_pages)):
                page_text = pdf_document.pages[page_num].extract_text()
                cleaned_text = clean_page_content(page_text, common_header)
                # FIX: the original pattern had lost the leading "目" character
                # ("目录" = table of contents; whitespace may split the glyphs).
                if re.search(r'目\s*录', cleaned_text, re.MULTILINE):
                    toc_page = page_num
                    break
            # Keep pages up to and including the TOC page; otherwise up to start_page.
            pages_to_extract = toc_page + 1 if toc_page != -1 else start_page
            for page_num in range(pages_to_extract):
                before_doc.add_page(pdf_document.pages[page_num])
            print(before_pdf_path)
            with open(before_pdf_path, 'wb') as f_before:
                before_doc.write(f_before)
        # Copy the requested page range into the main output document.
        output_doc = PdfWriter()
        for page_num in range(start_page, end_page + 1):
            output_doc.add_page(pdf_document.pages[page_num])
        with open(output_pdf_path, 'wb') as f_output:
            output_doc.write(f_output)
        # Message format mirrors the pattern "从 A 到 B 为 path" used elsewhere.
        print(f"{output_suffix} 已截取并保存页面从 {start_page + 1} 到 {end_page + 1} 为 {output_pdf_path}")
        return output_pdf_path
    except Exception as e:
        print(f"Error in save_pages_to_new_pdf: {e}")
        return ""  # signal failure with an empty path
def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header,
                                   is_secondary_match):
    """
    Locate the "投标人须知" (instructions to bidders) section and save it as
    two PDFs: the front table (start..mid) and the main body (mid..end).

    Returns a (table_path, body_path) tuple, or ("", "") when the three
    boundary pages cannot all be found.
    """
    pdf_document = PdfReader(pdf_path)
    # Catalogue/cross-reference phrases: once the mid page is known, pages
    # matching these must not terminate the section.
    exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')

    def run_extraction():
        start_page = None
        mid_page = None
        end_page = None
        chapter_type = None  # '章' or '部分', depending on the heading style

        for i, page in enumerate(pdf_document.pages):
            text = page.extract_text() or ""
            cleaned_text = clean_page_content(text, common_header)
            # Skip catalogue-style pages once inside the section body.
            if exclusion_pattern and re.search(exclusion_pattern, cleaned_text) and mid_page is not None:
                continue
            if start_page is None:
                match = re.search(begin_pattern, cleaned_text)
                if match and i > begin_page:
                    start_page = i
                    matched_text = match.group(0)
                    # FIX: the '章' literal had been stripped from the source
                    # ('' in s is always True, so chapter_type was always the
                    # falsy '' and end_page was never detected). Restored to
                    # mirror the '部分' branch below.
                    if '章' in matched_text:
                        chapter_type = '章'
                    elif '部分' in matched_text:
                        chapter_type = '部分'
                    else:
                        chapter_type = None  # neither heading style matched
                    if chapter_type:
                        # Build end_pattern for the detected chapter style.
                        if not is_secondary_match:
                            end_pattern = re.compile(
                                rf'^第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|'
                                r'^评标办法前附表|'
                                r'^附录(?:一)?[:]|'
                                r'^附件(?:一)?[:]|'
                                r'^附表(?:一)?[:]',
                                re.MULTILINE
                            )
                        else:
                            # FIX: removed a stray space inside the character
                            # class ([\u4e00 -\u9fff] made the range start at
                            # U+0020, matching almost everything).
                            end_pattern = re.compile(
                                rf'第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+\s*$'
                            )
                        # Headings in the OTHER numbering style also mark the
                        # end of the front table.
                        if chapter_type == '章':
                            additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:部分)'
                        elif chapter_type == '部分':
                            additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:章)'
                        else:
                            additional_mid_pattern = ''
                        # Base mid pattern: "(一) 说明 / 总则" style headings or
                        # an explicit "须知正文部分" heading.
                        base_mid_pattern = (
                            r'^\s*(?:[(]\s*[一1]?\s*[)]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
                            r'|^((投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文部分)'
                        )
                        if additional_mid_pattern:
                            combined_mid_pattern = re.compile(
                                rf'(?:{base_mid_pattern})|(?:{additional_mid_pattern})',
                                re.MULTILINE
                            )
                        else:
                            combined_mid_pattern = re.compile(
                                rf'{base_mid_pattern}',
                                re.MULTILINE
                            )
                    else:
                        # Neither "章" nor "部分": fall back to generic patterns.
                        if not is_secondary_match:
                            end_pattern = re.compile(
                                r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
                                r'^评标办法前附表|'
                                r'^附录(?:一)?[:]|'
                                r'^附件(?:一)?[:]|'
                                r'^附表(?:一)?[:]',
                                re.MULTILINE
                            )
                        else:
                            end_pattern = re.compile(
                                rf'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$'
                            )
                        base_mid_pattern = r'^\s*(?:[(]\s*[一1]?\s*[)]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
                        combined_mid_pattern = re.compile(
                            rf'{base_mid_pattern}',
                            re.MULTILINE
                        )
                    continue
            # Mid page: first page after start that matches the mid pattern.
            if start_page is not None and mid_page is None and combined_mid_pattern:
                if re.search(combined_mid_pattern, cleaned_text):
                    mid_page = i
            # End page: only searched once start and mid are fixed and a
            # definite chapter style was recognised.
            if start_page is not None and mid_page is not None and chapter_type:
                if re.search(end_pattern, cleaned_text):
                    if i > mid_page:
                        end_page = i
                        break
        return start_page, mid_page, end_page

    start_page, mid_page, end_page = run_extraction()
    if start_page is None or end_page is None or mid_page is None:
        print(f"first: tobidders_notice 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
        return "", ""
    path1 = save_pages_to_new_pdf(pdf_path, output_folder, "tobidders_notice_table", start_page, mid_page,
                                  common_header)
    path2 = save_pages_to_new_pdf(pdf_path, output_folder, "tobidders_notice", mid_page, end_page, common_header)
    return path1, path2
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix, common_header,
                  is_secondary_match=False):
    """
    Find the page span delimited by begin_pattern / end_pattern (starting the
    search at *begin_page*) and save it via save_pages_to_new_pdf.

    Returns a one-element list with the saved path, or [""] when either
    boundary is not found.
    """
    pdf_document = PdfReader(pdf_path)
    start_page = None
    end_page = None
    # Catalogue/cross-reference phrases; such pages never start or end a section.
    exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
    for i in range(len(pdf_document.pages)):
        page = pdf_document.pages[i]
        text = page.extract_text()
        if text:
            cleaned_text = clean_page_content(text, common_header)
            if re.search(exclusion_pattern, cleaned_text):
                continue
            if re.search(begin_pattern, cleaned_text) and i >= begin_page:
                # For 'notice'/'invalid' the FIRST matching page wins; for the
                # other suffixes keep updating so the LAST match wins.
                # BUG FIX: original tested truthiness (`if start_page`), which
                # treated a legitimate match on page 0 as "not yet set".
                if start_page is not None and (output_suffix == "notice" or output_suffix == "invalid"):
                    pass
                else:
                    start_page = i
            if start_page is not None and re.search(end_pattern, cleaned_text) and not re.search(begin_pattern, cleaned_text):
                if i > start_page:
                    # 'invalid' sections are assumed to span at least 30 pages.
                    is_invalid_condition = output_suffix == "invalid" and i > 30
                    if is_invalid_condition or output_suffix != "invalid":
                        end_page = i
                        break  # stop at the first valid end page
    if start_page is None or end_page is None:
        print(f"{output_suffix} first: 未找到起始或结束页在文件 {pdf_path} 中!")
        return [""]
    else:
        return [save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page, common_header)]
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, last_begin_index):
    """
    Fallback extractor used by truncate_pdf_main after every pattern pair of
    the primary strategy has failed.

    Two suffixes are supported:
      * "qualification": from last_begin_index onward, start at an appendix
        heading (附录/附件/附表) whose page mentions a qualification keyword,
        and end at the next appendix heading without keywords or at the next
        chapter heading / front table / bidder-notice heading.
      * "invalid": save a fixed window from last_begin_index up to
        min(90, two thirds of the document).

    Returns a one-element list with the saved path, or [] on failure.
    """
    try:
        pdf_document = PdfReader(pdf_path)
        if output_suffix == "qualification":
            print("twice:qualificaiton!")
            # Keywords identifying qualification-review content.
            include_keys = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查", "资格检查", "能力", "信誉"]
            # Start pattern: appendix headings only (附录/附件/附表).
            begin_pattern = r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:])'
            # End pattern A: the next appendix heading (no negative lookahead).
            end_pattern_attachment = re.compile(
                r'^(?:附录[一二三四五六七八九1-9]*[:]|附件[一二三四五六七八九1-9]*[:]|附表[一二三四五六七八九1-9]*[:]).*$',
                re.MULTILINE
            )
            # End pattern B: chapter heading, evaluation front table, or the
            # bidder-notice heading.
            end_pattern_chapter = re.compile(
                r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$|'
                r'\s*评标(?:办法|方法)前附表\s*$|'
                r'投标人须知',
                re.MULTILINE
            )
            start_page = None
            end_page = None
            # Scan only from last_begin_index onwards; enumerate keeps absolute indices.
            for i, page in enumerate(pdf_document.pages[last_begin_index:], start=last_begin_index):
                text = page.extract_text()
                if text:
                    cleaned_text = clean_page_content(text, common_header)
                    # A start page must both mention a keyword AND carry an
                    # appendix heading; keep the first such page.
                    if any(key in cleaned_text for key in include_keys):
                        if re.search(begin_pattern, cleaned_text, re.MULTILINE):
                            if start_page is None:
                                start_page = i
                            continue
                    # End: another appendix heading with no keywords on the page.
                    if start_page is not None and end_pattern_attachment.search(cleaned_text):
                        if not any(key in cleaned_text for key in include_keys):
                            if i > start_page:
                                end_page = i
                                break
                        else:
                            print(f"当前页面匹配附件结束模式但包含 include_keys继续查找。页面: {i}")
                    # End: chapter heading / front table / bidder notice.
                    elif start_page is not None and end_pattern_chapter.search(cleaned_text):
                        if i > start_page:
                            end_page = i
                            break
            if start_page is None or end_page is None:
                print(f"{output_suffix} twice: 未找到起始或结束页在文件 {pdf_path} 中!")
                return []
            else:
                return [save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page, common_header)]
        elif output_suffix == "invalid":
            pdf_document = PdfReader(pdf_path)
            total_pages = len(pdf_document.pages)
            # Cap the window at two thirds of the document.
            total = int(total_pages * 2 / 3)
            start_page = last_begin_index
            end_page = min(90, total)
            return [save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page, common_header)]
        else:
            print(f"{output_suffix} twice: 未定义的输出后缀。")
            return []
    except Exception as e:
        print(f"Error in extract_pages_twice: {e}")
        return []
def get_start_and_common_header(input_path):
    """
    Return (common_header, first_announcement_page_index) for *input_path*.

    Only the first 26 pages are scanned for a tender-announcement heading;
    index 0 is returned when none is found inside that window.
    """
    header = extract_common_header(input_path)
    announcement_re = re.compile(
        r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\)]?\s*$',
        re.MULTILINE
    )
    reader = PdfReader(input_path)
    for idx, pg in enumerate(reader.pages):
        if idx > 25:
            # Beyond page 26: give up and treat the document as starting at 0.
            return header, 0
        raw = pg.extract_text()
        if not raw:
            continue
        if announcement_re.search(clean_page_content(raw, header)):
            return header, idx  # first page whose heading matches
    return header, 0
def truncate_pdf_main(input_path, output_folder, selection):
    """
    Truncate one section out of a tender PDF.

    *input_path* may be a single PDF or a directory of PDFs (each processed
    recursively). *selection* picks the section:
    1 = bidder notice (front table + body), 2 = evaluation method,
    3 = qualification review, 4 = tender announcement, 5 = invalid-bid scope.

    Each selection carries an ordered list of (begin, end) regex pairs; the
    second pair of each list is a looser "secondary" match. Pairs are tried in
    order, and extract_pages_twice serves as the final fallback.

    Returns a list of generated paths; empty strings mark failures (two
    entries for selection 1, one otherwise).
    """
    if os.path.isdir(input_path):
        # Directory: process every PDF inside and pool the results.
        generated_files = []
        for file_name in os.listdir(input_path):
            if file_name.endswith('.pdf'):
                file_path = os.path.join(input_path, file_name)
                files = truncate_pdf_main(file_path, output_folder, selection)
                if files:
                    generated_files.extend(files)
        return generated_files
    elif os.path.isfile(input_path) and input_path.endswith('.pdf'):
        common_header, last_begin_index = get_start_and_common_header(input_path)
        if selection == 1:
            # Selection 1: bidder-notice front table (投标人须知前附表)
            pattern_pairs = [
                (
                    re.compile(
                        r'(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表)'),
                    re.compile(
                        r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标办法前附表|^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]',
                        re.MULTILINE)
                ),
                (
                    re.compile(
                        r'.*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表',
                        re.MULTILINE),
                    re.compile(
                        r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标办法前附表|^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]',
                        re.MULTILINE)
                )
            ]
            output_suffix = "tobidders_notice"
        elif selection == 2:
            # Selection 2: evaluation method (评标办法)
            pattern_pairs = [
                (
                    re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))'),
                    # Alternative end pattern
                    re.compile(r'评标办法正文|评标方法正文|^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+')
                ),
                (
                    re.compile(
                        r'(?<!见)'  # must not be preceded by 见 ("see")
                        r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*'  # "第X章" / "第X部分"
                        r'[\u4e00-\u9fff、()]*?'  # allowed chars (CJK, 、, parens)
                        r'(?=.*(?:磋商|谈判|评标|评定|评审))'  # must mention an evaluation word
                        r'(?=.*(?:办法|方法))'  # must mention 办法/方法 (method)
                        r'[\u4e00-\u9fff、()]*\s*$'  # allowed chars until end of line
                        r'|\s*评标(办法|方法)前附表\s*$',  # or the evaluation front table
                        re.MULTILINE
                    ),
                    re.compile(r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE)
                )
            ]
            output_suffix = "evaluation_method"
            # TODO:exclusion
        elif selection == 3:
            # Selection 3: qualification review (资格审查条件)
            pattern_pairs = [
                (
                    re.compile(
                        r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:]).*(?:资质|能力|信誉).*$|第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]*资格[\u4e00-\u9fff、()]*\s*$',
                        re.MULTILINE),
                    re.compile(
                        r'^(?:附录[一二三四五六七八九1-9]*[:]|附件[一二三四五六七八九1-9]*[:]|附表[一二三四五六七八九1-9]*[:])(?!.*(?:资质|能力|信誉)).*|'
                        r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', re.MULTILINE)
                )
            ]
            output_suffix = "qualification"
        elif selection == 4:
            # Selection 4: tender announcement (招标公告)
            pattern_pairs = [
                (
                    re.compile(
                        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'),
                    re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
                ),
                (
                    re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\)]?\s*$', re.MULTILINE),
                    re.compile(r".*(?:投标人须知|投标人须知前附表)\s*$", re.MULTILINE)
                )
            ]
            output_suffix = "notice"
        elif selection == 5:
            # Selection 5: invalid-bid scope (无效标)
            pattern_pairs = [
                (
                    re.compile(
                        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'),
                    re.compile(r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|^第二卷', re.MULTILINE)
                ),
                (
                    re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\)]?\s*$', re.MULTILINE),
                    re.compile(r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|^第二卷', re.MULTILINE)
                )
            ]
            output_suffix = "invalid"
        else:
            print("无效的选择:请选择1-5")
            return [""]
        # Try each pattern pair in turn; the second pair is the looser match.
        for idx, (begin_pattern, end_pattern) in enumerate(pattern_pairs, start=1):
            is_secondary_match = (idx == 2)
            # Default start pages per selection when no announcement page was found.
            begin_page = last_begin_index if last_begin_index != 0 else {
                1: 3,   # bidder-notice front table
                2: 10,  # evaluation method
                3: 5,   # qualification review
                4: 0,   # announcement
                5: 0    # invalid-bid scope
            }.get(selection, 0)
            if selection == 1:  # bidder notice has its own two-part extractor
                output_paths = list(
                    extract_pages_tobidders_notice(input_path, output_folder, begin_pattern, begin_page, common_header,
                                                   is_secondary_match))
                if output_paths and any(os.path.isfile(f) for f in output_paths):
                    return output_paths
                else:
                    continue  # this pair failed; try the next
            output_paths = extract_pages(
                input_path,
                output_folder,
                begin_pattern,
                begin_page=begin_page,
                end_pattern=end_pattern,
                output_suffix=output_suffix,
                common_header=common_header,
                is_secondary_match=is_secondary_match
            )
            if output_paths and any(os.path.isfile(f) for f in output_paths):
                return output_paths  # success: return immediately
            else:
                continue  # this pair failed; try the next
        # All pattern pairs failed: attempt the fallback extractor.
        print(f"Selection {selection}: 所有模式对都未匹配成功。尝试执行 extract_pages_twice 或返回空。")
        fallback_result = extract_pages_twice(input_path, output_folder, output_suffix, common_header, last_begin_index)
        if fallback_result and any(os.path.isfile(f) for f in fallback_result):
            return fallback_result
        else:
            # Selection 1 callers expect two slots (table + body).
            if selection == 1:
                return ["", ""]
            else:
                return ['']
    else:
        print("Invalid input path. It is neither a PDF file nor a directory containing PDF files.")
        return ['']
def truncate_pdf_multiple(input_path, output_folder, unique_id="123"):
    """
    Run all five section truncations (selections 1-5) concurrently for one
    PDF, then merge the successful pieces into "<basename>_merged_baseinfo.pdf".

    Returns the per-selection output paths (empty strings mark failures, two
    slots for selection 1) followed by the merged-file path ("" when no merge
    was produced).
    """
    global logger
    logger = get_global_logger(unique_id)
    base_file_name = os.path.splitext(os.path.basename(input_path))[0]
    truncate_files = []
    selections = range(1, 6)
    with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
        pending = {
            selection: executor.submit(truncate_pdf_main, input_path, output_folder, selection)
            for selection in selections
        }
        # Harvest in selection order so the output layout stays stable.
        for selection in selections:
            try:
                produced = pending.get(selection).result()
                kept = [path for path in (produced or []) if path]
                if kept:
                    truncate_files.extend(kept)
                else:
                    logger.error(f"Selection {selection}: 截取失败,已添加空字符串。")
                    # Selection 1 owns two slots; everything else owns one.
                    truncate_files.extend(["", ""] if selection == 1 else [""])
            except Exception as e:
                logger.error(f"Selection {selection} 生成了一个异常: {e}")
                truncate_files.extend(["", ""] if selection == 1 else [""])
    # Merge whatever succeeded into the combined base-info PDF.
    if any(filter(None, truncate_files)):
        merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
        merged_result = merge_selected_pdfs_for_engineering(
            output_folder, truncate_files, merged_output_path, base_file_name)
        if merged_result:
            truncate_files.append(merged_result)
        else:
            truncate_files.append("")  # keep the slot even when no merge happened
            logger.warning("merged_baseinfo: 未生成合并文件,因为没有找到需要合并的 PDF 文件。")
    else:
        truncate_files.append("")  # nothing to merge at all
        logger.warning(f"merged_baseinfo: 没有文件需要合并 for {input_path}")
    return truncate_files
def truncate_pdf_specific_engineering(pdf_path, output_folder, selections, unique_id="123"):
    """
    Truncate only the requested *selections* from one PDF concurrently, then
    merge the successful pieces into "<basename>_merged_specific.pdf".

    Returns the per-selection paths (empty strings mark failures, two slots
    for selection 1) plus the merged path ("" when nothing was merged). On an
    unexpected error, returns one empty string per requested selection.
    """
    try:
        global logger
        logger = get_global_logger(unique_id)
        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        truncate_files = []
        with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
            pending = {
                selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection)
                for selection in selections
            }
            # Harvest in the caller-given selection order.
            for selection in selections:
                try:
                    produced = pending.get(selection).result()
                    kept = [path for path in (produced or []) if path]
                    if kept:
                        truncate_files.extend(kept)
                    else:
                        logger.error(f"Selection {selection}: 截取失败,已添加空字符串。")
                        # Selection 1 owns two slots; everything else owns one.
                        truncate_files.extend(["", ""] if selection == 1 else [""])
                except Exception as e:
                    logger.error(f"Selection {selection} 生成了一个异常: {e}")
                    truncate_files.extend(["", ""] if selection == 1 else [""])
        # Merge whatever succeeded into the combined specific-engineering PDF.
        if any(filter(None, truncate_files)):
            merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_specific.pdf")
            merged_result = merge_selected_pdfs_for_engineering(
                output_folder, truncate_files, merged_output_path, base_file_name)
            if merged_result:
                truncate_files.append(merged_result)
                logger.info(f"merged_specific: 已生成合并文件: {merged_output_path}")
            else:
                truncate_files.append("")  # keep the slot even when no merge happened
                logger.warning("merged_specific: 未生成合并文件,因为没有找到需要合并的 PDF 文件。")
        else:
            truncate_files.append("")  # nothing to merge at all
            logger.warning(f"merged_specific: 没有文件需要合并 for {pdf_path}")
        return truncate_files
    except Exception as e:
        logger.error(f"Error in truncate_pdf_specific_engineering: {e}")
        return [""] * len(selections)  # one empty slot per requested selection
if __name__ == "__main__":
    # Ad-hoc manual run: truncate one section from a sample tender PDF and
    # report the elapsed wall-clock time.
    start_time = time.time()
    input_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest20.pdf"
    output_folder = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\all"
    # 1 = bidder notice, 2 = evaluation method, 3 = qualification,
    # 4 = announcement, 5 = invalid bids
    selection = 2
    generated_files = truncate_pdf_main(input_path, output_folder, selection)
    print(generated_files)
    end_time = time.time()
    print("耗时:" + str(end_time - start_time))