This commit is contained in:
zy123 2024-10-31 20:12:08 +08:00
parent 7d3bca17b3
commit 58328c2f22
5 changed files with 296 additions and 233 deletions

View File

@ -69,4 +69,227 @@ def find_and_merge(target_path, output_suffix):
paths=[target_path]
else:
paths=[target_path,full_path]
merge_pdfs(paths,target_path)
merge_pdfs(paths,target_path)
def merge_selected_pdfs_for_engineering(output_folder, truncate_files, output_path, base_file_name):
    """
    Merge the "before" PDF with the notice and bidder-notice-table PDFs.

    Files are merged in this fixed order:
      1. files in ``output_folder`` whose name ends with ``{base_file_name}_before.pdf``
      2. entries of ``truncate_files`` ending with ``{base_file_name}_notice.pdf``
      3. entries of ``truncate_files`` ending with
         ``{base_file_name}_tobidders_notice_table.pdf``
    All three kinds are required. After a successful merge the file
    ``{base_file_name}_before.pdf`` in ``output_folder`` is deleted.

    Args:
        output_folder (str): Folder that holds the ``*_before.pdf`` file.
        truncate_files (list): Candidate PDF paths; blank or ``None`` entries are ignored.
        output_path (str): Destination path of the merged PDF.
        base_file_name (str): Base name used to build the expected suffixes.

    Returns:
        str: ``output_path`` on success; ``""`` when the folder cannot be listed,
        a required file is missing, nothing valid is left to merge, or merging fails.
    """
    # 1. List everything in output_folder.
    try:
        all_output_files = os.listdir(output_folder)
    except FileNotFoundError:
        print(f"输出文件夹 '{output_folder}' 未找到。")
        return ""
    except PermissionError:
        print(f"没有权限访问输出文件夹 '{output_folder}'")
        return ""

    # 2. Suffixes to pick, in merge order (the "before" file comes first).
    before_suffix = f'{base_file_name}_before.pdf'
    desired_suffixes = [
        before_suffix,
        f'{base_file_name}_notice.pdf',
        f'{base_file_name}_tobidders_notice_table.pdf'
    ]

    all_pdfs_to_merge = []
    missing_files = []  # required suffixes for which nothing was found
    for suffix in desired_suffixes:
        if suffix == before_suffix:
            # The "before" file is looked up in output_folder itself.
            matching_files = [
                os.path.join(output_folder, f)
                for f in all_output_files
                if f.endswith(suffix)
            ]
        else:
            # Everything else comes from truncate_files; skip blank/None
            # placeholders left behind by failed extraction steps.
            matching_files = [f for f in truncate_files if f and f.endswith(suffix)]

        if matching_files:
            # Several matches are merged in name order.
            matching_files_sorted = sorted(matching_files)
            all_pdfs_to_merge.extend(matching_files_sorted)
            for f in matching_files_sorted:
                print(f"选中文件: {f}")
        else:
            print(f"没有找到以 '{suffix}' 结尾的文件。")
            missing_files.append(suffix)

    # Refuse to merge unless every required file was found.
    if missing_files:
        print("缺少以下必要的 PDF 文件,无法进行合并:")
        for missing in missing_files:
            print(f" - {missing}")
        return ""

    # Drop paths that do not exist or are empty (same rule as the goods variant).
    all_pdfs_to_merge = [
        f for f in all_pdfs_to_merge
        if os.path.isfile(f) and os.path.getsize(f) > 0
    ]
    if not all_pdfs_to_merge:
        print("没有找到要合并的 PDF 文件。")
        return ""

    # Merge via merge_pdfs; any failure is reported and turned into "".
    try:
        merge_pdfs(all_pdfs_to_merge, output_path)
        print(f"已成功合并 PDF 文件到 '{output_path}'")
    except Exception as e:
        print(f"合并 PDF 文件时出错: {e}")
        return ""

    # Only a non-empty output file counts as success.
    if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
        # The "before" file is no longer needed once it is part of the merged PDF.
        before_pdf_path = os.path.join(output_folder, before_suffix)
        if os.path.exists(before_pdf_path):
            try:
                os.remove(before_pdf_path)
                print(f"已删除文件: {before_pdf_path}")
            except Exception as e:
                print(f"删除文件 {before_pdf_path} 时出错: {e}")
        else:
            print(f"未找到要删除的文件: {before_pdf_path}")
        return output_path
    else:
        print(f"合并失败,没有生成 '{output_path}'")
        return ""
def _select_pdfs_by_suffix(suffix, candidates):
    """Return the entries of ``candidates`` ending with ``suffix``, sorted by name, printing each one."""
    # Blank/None placeholders from failed extraction steps never match.
    selected = sorted(f for f in candidates if f and f.endswith(suffix))
    for f in selected:
        print(f"选中文件: {f}")
    return selected


# Merge the "before" file + tender notice + bidder-notice front table + bidder-notice body.
def merge_selected_pdfs_for_goods(output_folder, truncate_files, output_path, base_file_name):
    """
    Merge the "before" PDF with the notice and bidder-notice PDFs.

    Files are merged in this fixed order:
      1. files in ``output_folder`` whose name ends with ``{base_file_name}_before.pdf`` (required)
      2. entries of ``truncate_files`` ending with ``{base_file_name}_notice.pdf`` (required)
      3. entries ending with ``{base_file_name}_tobidders_notice_part1.pdf`` (required)
      4. entries ending with ``{base_file_name}_tobidders_notice_part2.pdf`` (optional)
    After a successful merge the file ``{base_file_name}_before.pdf`` in
    ``output_folder`` is deleted.

    Args:
        output_folder (str): Folder that holds the ``*_before.pdf`` file.
        truncate_files (list): Candidate PDF paths; blank or ``None`` entries are ignored.
        output_path (str): Destination path of the merged PDF.
        base_file_name (str): Base name used to build the expected suffixes.

    Returns:
        str: ``output_path`` if the merge succeeded, otherwise ``""``.
    """
    # 1. List everything in output_folder.
    try:
        all_output_files = os.listdir(output_folder)
    except FileNotFoundError:
        print(f"输出文件夹 '{output_folder}' 未找到。")
        return ""
    except PermissionError:
        print(f"没有权限访问输出文件夹 '{output_folder}'")
        return ""

    # 2. Suffixes to pick, in merge order.
    before_suffix = f'{base_file_name}_before.pdf'
    required_suffixes = [
        before_suffix,
        f'{base_file_name}_notice.pdf',
        f'{base_file_name}_tobidders_notice_part1.pdf'
    ]
    optional_suffixes = [
        f'{base_file_name}_tobidders_notice_part2.pdf'
    ]

    all_pdfs_to_merge = []
    missing_files = []  # required suffixes for which nothing was found

    # 3. Required files.
    for suffix in required_suffixes:
        if suffix == before_suffix:
            # The "before" file is looked up in output_folder itself; match on
            # the bare file name first, then build the full path.
            candidates = [
                os.path.join(output_folder, f)
                for f in all_output_files
                if f.endswith(suffix)
            ]
        else:
            candidates = truncate_files
        matching_files = _select_pdfs_by_suffix(suffix, candidates)
        if matching_files:
            all_pdfs_to_merge.extend(matching_files)
        else:
            print(f"没有找到以 '{suffix}' 结尾的文件。")
            missing_files.append(suffix)

    # 4. Optional files always come from truncate_files; absence is not an error.
    for suffix in optional_suffixes:
        matching_files = _select_pdfs_by_suffix(suffix, truncate_files)
        if matching_files:
            all_pdfs_to_merge.extend(matching_files)
        else:
            print(f"可选文件 '{suffix}' 未找到,继续合并。")

    # 5. Refuse to merge unless every required file was found.
    if missing_files:
        print("缺少以下必要的 PDF 文件,无法进行合并:")
        for missing in missing_files:
            print(f" - {missing}")
        return ""

    # 6. Drop paths that do not exist or are empty, then report the count of
    #    files that will actually be merged.
    all_pdfs_to_merge = [
        f for f in all_pdfs_to_merge
        if os.path.isfile(f) and os.path.getsize(f) > 0
    ]
    print(f"总共将要合并的 PDF 文件数量: {len(all_pdfs_to_merge)}")
    if not all_pdfs_to_merge:
        print("没有找到要合并的有效 PDF 文件。")
        return ""

    # 7. Merge via merge_pdfs; any failure is reported and turned into "".
    try:
        merge_pdfs(all_pdfs_to_merge, output_path)
        print(f"已成功合并 PDF 文件到 '{output_path}'")
    except Exception as e:
        print(f"合并 PDF 文件时出错: {e}")
        return ""

    # 8. Only a non-empty output file counts as success.
    if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
        # The "before" file is no longer needed once it is part of the merged PDF.
        before_pdf_path = os.path.join(output_folder, before_suffix)
        if os.path.exists(before_pdf_path):
            try:
                os.remove(before_pdf_path)
                print(f"已删除文件: {before_pdf_path}")
            except Exception as e:
                print(f"删除文件 {before_pdf_path} 时出错: {e}")
        else:
            print(f"未找到要删除的文件: {before_pdf_path}")
        return output_path
    else:
        print(f"合并失败,没有生成 '{output_path}'")
        return ""

View File

@ -69,6 +69,7 @@ def extract_text_by_page(file_path):
page = reader.pages[page_num]
text = page.extract_text()
if text:
print("-------------------")
cleaned_text = clean_page_content(text,common_header)
print(cleaned_text)
result += cleaned_text
@ -149,7 +150,7 @@ def extract_text_by_page(file_path):
if __name__ == '__main__':
file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest18.pdf"
file_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\招标文件\\HBDL-2024-0181-001-招标文件.pdf"
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'

View File

@ -321,6 +321,7 @@ def process_reviews(original_dict_data, output_folder, truncate0_jsonpath, claus
updated_json = update_json_data(original_dict_data, combined_results1, combined_results2, first_response_list)
return updated_json
#TODO:现在是clause truncate_jsonpath非空就尝试从中提取但是有可能从里面提取失败这个时候可能需要重新问大模型。
if __name__ == "__main__":
start_time=time.time()
# knowledge_name="zbfile"

View File

@ -3,7 +3,7 @@ import os
import time
from PyPDF2 import PdfReader, PdfWriter
from flask_app.general.merge_pdfs import merge_pdfs
from flask_app.general.merge_pdfs import merge_pdfs, merge_selected_pdfs_for_engineering
import concurrent.futures
import logging
@ -122,7 +122,7 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en
print(f"Error in save_pages_to_new_pdf: {e}")
return "" # 返回空字符串
def extract_pages_tobidders_notice(pdf_path, output_folder,begin_pattern, begin_page,common_header):
def extract_pages_tobidders_notice(pdf_path, output_folder,begin_pattern, begin_page,common_header,is_secondary_match):
pdf_document = PdfReader(pdf_path)
exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据')
def run_extraction():
@ -152,16 +152,19 @@ def extract_pages_tobidders_notice(pdf_path, output_folder,begin_pattern, begin_
if chapter_type:
# 根据 chapter_type 动态生成 end_pattern
end_pattern = re.compile(
rf'^第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|' # Match chapter pattern
r'^评标办法前附表|' # Match "评标办法前附表" at the beginning of a line
r'^附录(?:一)?[:]|' # Match "附录一" with optional "一" and colon
r'^附件(?:一)?[:]|' # Match "附件一" with optional "一" and colon
r'^附表(?:一)?[:]', # Match "附表一" with optional "一" and colon
re.MULTILINE
)
# print(f"动态生成的 end_pattern: {end_pattern.pattern}") # 打印生成的 end_pattern
if not is_secondary_match:
end_pattern = re.compile(
rf'^第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|'
r'^评标办法前附表|'
r'^附录(?:一)?[:]|'
r'^附件(?:一)?[:]|'
r'^附表(?:一)?[:]',
re.MULTILINE
)
else:
end_pattern = re.compile(
rf'第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00 -\u9fff]+\s*$'
)
# 根据 chapter_type 动态生成 additional_mid_pattern
if chapter_type == '':
additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:部分)'
@ -189,16 +192,20 @@ def extract_pages_tobidders_notice(pdf_path, output_folder,begin_pattern, begin_
# print(f"生成的 combined_mid_pattern: {combined_mid_pattern.pattern}") # 打印 combined_mid_pattern
else:
# 如果未匹配到“章”或“部分”,使用默认的 end_pattern 和 mid_pattern
end_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
r'^评标办法前附表|' # Match "评标办法前附表" at the beginning of a line
r'^附录(?:一)?[:]|' # Match "附录一" with optional "一" and colon
r'^附件(?:一)?[:]|' # Match "附件一" with optional "一" and colon
r'^附表(?:一)?[:]', # Match "附表一" with optional "一" and colon
re.MULTILINE
)
if not is_secondary_match:
end_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
r'^评标办法前附表|' # Match "评标办法前附表" at the beginning of a line
r'^附录(?:一)?[:]|' # Match "附录一" with optional "一" and colon
r'^附件(?:一)?[:]|' # Match "附件一" with optional "一" and colon
r'^附表(?:一)?[:]', # Match "附表一" with optional "一" and colon
re.MULTILINE
)
else:
end_pattern = re.compile(
rf'第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+\s*$'
)
# print(f"使用默认的 end_pattern: {end_pattern.pattern}") # 打印默认的 end_pattern
# 定义基础的 mid_pattern
base_mid_pattern = r'^\s*(?:[(]\s*[一1]?\s*[)]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
combined_mid_pattern = re.compile(
@ -212,14 +219,9 @@ def extract_pages_tobidders_notice(pdf_path, output_folder,begin_pattern, begin_
mid_page = i
if start_page is not None and mid_page is not None and chapter_type:
if re.search(end_pattern, cleaned_text):
if mid_page is None:
if i > start_page:
end_page = i
break
else:
if i > mid_page:
end_page = i
break
if i > mid_page:
end_page = i
break
return start_page, mid_page, end_page
@ -248,11 +250,10 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
if text:
cleaned_text = clean_page_content(text, common_header)
if re.search(begin_pattern, cleaned_text) and i >= begin_page:
if not start_page:
if (output_suffix == "notice" or output_suffix == "invalid") and is_secondary_match:
pass # 针对 '同招标公告' 的情况
elif output_suffix not in ["notice", "invalid"] or not is_secondary_match:
start_page = i
if start_page and (output_suffix == "notice" or output_suffix == "invalid"):
pass
else:
start_page = i
if start_page is not None and re.search(end_pattern, cleaned_text):
condition = i > start_page
if condition:
@ -332,7 +333,14 @@ def truncate_pdf_main(input_path, output_folder, selection):
re.compile(
r'(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表)'),
re.compile(
r'第[一二三四五六七八九十]+章\s*评标办法|^评标办法前附表|^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]',
r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标办法前附表|^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]',
re.MULTILINE)
),
(
re.compile(
r'.*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表', re.MULTILINE),
re.compile(
r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标办法前附表|^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]',
re.MULTILINE)
)
]
@ -341,21 +349,28 @@ def truncate_pdf_main(input_path, output_folder, selection):
# Selection 2: 评标办法
pattern_pairs = [
(
re.compile(r'第[一二三四五六七八九十]+\s*评标办法'), # Alternative begin pattern
re.compile(r'第[一二三四五六七八九十]+(?:|部分)\s*评标办法'), # Alternative begin pattern
re.compile(r'评标办法正文|评标办法') # Alternative end pattern
),
]
output_suffix = "evaluation_method"
#TODO:exclusion
elif selection == 3:
# Selection 3: 资格审查条件
pattern_pairs = [
(
re.compile(r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:]).*(?:资质|能力|信誉).*$',
re.compile(r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:]).*(?:资质|能力|信誉).*$|^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*',
re.MULTILINE),
re.compile(
r'^(?:附录[一二三四五六七八九1-9]*[:]|附件[一二三四五六七八九1-9]*[:]|附表[一二三四五六七八九1-9]*[:])(?!.*(?:资质|能力|信誉)).*$'
r'^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
),
(
re.compile(
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]*资格[\u4e00-\u9fff]*\s*$',re.MULTILINE),
re.compile(
r'^(?:附录[一二三四五六七八九1-9]*[:]|附件[一二三四五六七八九1-9]*[:]|附表[一二三四五六七八九1-9]*[:])(?!.*(?:资质|能力|信誉)).*|'
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$',re.MULTILINE)
)
]
output_suffix = "qualification"
@ -383,7 +398,7 @@ def truncate_pdf_main(input_path, output_folder, selection):
re.compile(r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|^第二卷', re.MULTILINE)
),
(
re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|第一卷|投标邀请)\s*$', re.MULTILINE),
re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\)]?\s*$', re.MULTILINE),
re.compile(r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|^第二卷', re.MULTILINE)
)
]
@ -404,7 +419,7 @@ def truncate_pdf_main(input_path, output_folder, selection):
}.get(selection, 0)
if selection == 1: #投标人须知
output_paths = list(
extract_pages_tobidders_notice(input_path,output_folder, begin_pattern, begin_page, common_header))
extract_pages_tobidders_notice(input_path,output_folder, begin_pattern, begin_page, common_header,is_secondary_match))
if output_paths and any(os.path.isfile(f) for f in output_paths):
return output_paths
else:
@ -495,93 +510,6 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix,common_header,las
return []
def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_name):
    """
    Merge the "before" PDF with the notice and bidder-notice-table PDFs.

    Files in ``output_folder`` ending with ``{base_file_name}_before.pdf`` come
    first, followed by the entries of ``truncate_files`` ending with
    ``{base_file_name}_notice.pdf`` and then those ending with
    ``{base_file_name}_tobidders_notice_table.pdf``. A missing kind is only
    reported; whatever was found is still merged.

    Args:
        output_folder (str): Folder that holds the ``*_before.pdf`` file.
        truncate_files (list): Candidate PDF paths.
        output_path (str): Destination path of the merged PDF.
        base_file_name (str): Base name used to build the expected suffixes.

    Returns:
        str: Path of the merged PDF, or ``""`` if nothing was merged.
    """
    # Step 1: read the folder listing; bail out if it is unavailable.
    try:
        folder_entries = os.listdir(output_folder)
    except FileNotFoundError:
        print(f"输出文件夹 '{output_folder}' 未找到。")
        return ""
    except PermissionError:
        print(f"没有权限访问输出文件夹 '{output_folder}'")
        return ""

    # Step 2: suffixes in merge order, the "before" file first.
    cover_suffix = f'{base_file_name}_before.pdf'
    ordered_suffixes = (
        cover_suffix,
        f'{base_file_name}_notice.pdf',
        f'{base_file_name}_tobidders_notice_table.pdf',
    )

    collected = []
    for wanted in ordered_suffixes:
        if wanted == cover_suffix:
            # The "before" file lives in output_folder.
            hits = [
                os.path.join(output_folder, entry)
                for entry in folder_entries
                if entry.endswith(wanted)
            ]
        else:
            # The remaining pieces are taken from truncate_files.
            hits = [path for path in truncate_files if path.endswith(wanted)]

        if not hits:
            print(f"没有找到以 '{wanted}' 结尾的文件。")
            continue

        # Multiple matches go in by name order.
        hits.sort()
        collected += hits
        for path in hits:
            print(f"选中文件: {path}")

    # Keep only paths that point at an existing file.
    collected = list(filter(os.path.isfile, collected))
    if len(collected) == 0:
        print("没有找到要合并的 PDF 文件。")
        return ""

    # Hand the ordered list to merge_pdfs.
    try:
        merge_pdfs(collected, output_path)
        print(f"已成功合并 PDF 文件到 '{output_path}'")
    except Exception as e:
        print(f"合并 PDF 文件时出错: {e}")
        return ""

    # The merge counts as successful only if a non-empty file was produced.
    produced = os.path.exists(output_path) and os.path.getsize(output_path) > 0
    if not produced:
        print(f"合并失败,没有生成 '{output_path}'")
        return ""

    # On success, remove the now-redundant "before" file.
    cover_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
    if not os.path.exists(cover_path):
        print(f"未找到要删除的文件: {cover_path}")
        return output_path
    try:
        os.remove(cover_path)
        print(f"已删除文件: {cover_path}")
    except Exception as e:
        print(f"删除文件 {cover_path} 时出错: {e}")
    return output_path
def truncate_pdf_multiple(input_path, output_folder, unique_id="123"):
global logger
logger = get_global_logger(unique_id)
@ -615,7 +543,7 @@ def truncate_pdf_multiple(input_path, output_folder, unique_id="123"):
# Proceed with merging logic as before...
if any(f for f in truncate_files if f): # 检查是否有有效的文件路径
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
merged_result = merge_selected_pdfs(output_folder, truncate_files, merged_output_path, base_file_name)
merged_result = merge_selected_pdfs_for_engineering(output_folder, truncate_files, merged_output_path, base_file_name)
if merged_result:
truncate_files.append(merged_result)
logger.info(f"merged_baseinfo: 已生成合并文件: {merged_output_path}")
@ -662,7 +590,7 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections, uniqu
# Proceed with merging logic as before...
if any(f for f in truncate_files if f): # 检查是否有有效的文件路径
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_specific.pdf")
merged_result = merge_selected_pdfs(output_folder, truncate_files, merged_output_path, base_file_name)
merged_result = merge_selected_pdfs_for_engineering(output_folder, truncate_files, merged_output_path, base_file_name)
if merged_result:
truncate_files.append(merged_result)
logger.info(f"merged_specific: 已生成合并文件: {merged_output_path}")
@ -689,15 +617,15 @@ if __name__ == "__main__":
input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\招标文件\\HBDL-2024-0181-001-招标文件.pdf"
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\68549b0b-e892-41a9-897c-c3694535ee61\\ztbfile.pdf"
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
# input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹"
# input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
output_folder="C:\\Users\\Administrator\\Desktop\\new招标文件\\招标文件\\tmp"
# files=truncate_pdf_multiple(input_path,output_folder)
# selections = [5, 1] # 仅处理 selection 5、1
# files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
# print(files)
selection = 5 # 例如1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
selection = 3 # 例如1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
generated_files = truncate_pdf_main(input_path, output_folder, selection)
print("生成的文件:", generated_files)
# print("生成的文件:", generated_files)
end_time=time.time()
print("耗时:"+str(end_time-start_time))

View File

@ -4,7 +4,7 @@ from PyPDF2 import PdfReader, PdfWriter
import re # 导入正则表达式库
import os # 用于文件和文件夹操作
from flask_app.general.format_change import docx2pdf
from flask_app.general.merge_pdfs import merge_and_cleanup,merge_pdfs
from flask_app.general.merge_pdfs import merge_and_cleanup, merge_pdfs, merge_selected_pdfs_for_goods
import concurrent.futures
def get_global_logger(unique_id):
if unique_id is None:
@ -367,14 +367,9 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, comm
mid_page = i
if start_page is not None and mid_page is not None and chapter_type:
if re.search(end_pattern, cleaned_text):
if mid_page is None:
if i > start_page:
end_page = i
break
else:
if i > mid_page:
end_page = i
break
if i > mid_page:
end_page = i
break
return start_page, mid_page, end_page
@ -536,91 +531,6 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo
print(f"Error in save_extracted_pages: {e}")
return "" # 返回空字符串
# Merge the "before" file + tender notice + bidder-notice front table + bidder-notice body.
def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_name):
    """
    Merge the "before" PDF with the notice and both bidder-notice parts.

    Files in ``output_folder`` ending with ``{base_file_name}_before.pdf`` come
    first, followed by the entries of ``truncate_files`` ending with
    ``_notice.pdf``, ``_tobidders_notice_part1.pdf`` and
    ``_tobidders_notice_part2.pdf`` (each prefixed by ``base_file_name``).
    A missing kind is only reported; whatever was found is still merged.

    Args:
        output_folder (str): Folder that holds the ``*_before.pdf`` file.
        truncate_files (list): Candidate PDF paths.
        output_path (str): Destination path of the merged PDF.
        base_file_name (str): Base name used to build the expected suffixes.

    Returns:
        str: ``output_path`` if the merge succeeded, otherwise ``""``.
    """
    # Step 1: read the folder listing; bail out if it is unavailable.
    try:
        folder_entries = os.listdir(output_folder)
    except FileNotFoundError:
        print(f"输出文件夹 '{output_folder}' 未找到。")
        return ""
    except PermissionError:
        print(f"没有权限访问输出文件夹 '{output_folder}'")
        return ""

    # Step 2: suffixes in merge order, the "before" file first.
    cover_suffix = f'{base_file_name}_before.pdf'
    ordered_suffixes = (
        cover_suffix,
        f'{base_file_name}_notice.pdf',
        f'{base_file_name}_tobidders_notice_part1.pdf',
        f'{base_file_name}_tobidders_notice_part2.pdf',
    )

    collected = []
    for wanted in ordered_suffixes:
        if wanted == cover_suffix:
            # The "before" file lives in output_folder.
            hits = [
                os.path.join(output_folder, entry)
                for entry in folder_entries
                if entry.endswith(wanted)
            ]
        else:
            # The remaining pieces are taken from truncate_files.
            hits = [path for path in truncate_files if path.endswith(wanted)]

        if not hits:
            print(f"没有找到以 '{wanted}' 结尾的文件。")
            continue

        # Multiple matches go in by name order.
        hits.sort()
        collected += hits
        for path in hits:
            print(f"选中文件: {path}")

    print(f"总共将要合并的 PDF 文件数量: {len(collected)}")
    if len(collected) == 0:
        print("没有找到要合并的 PDF 文件。")
        return ""

    # Hand the ordered list to merge_pdfs; errors from it propagate to the caller.
    merge_pdfs(collected, output_path)
    print(f"已成功合并 PDF 文件到 '{output_path}'")

    # The merge counts as successful only if a non-empty file was produced.
    produced = os.path.exists(output_path) and os.path.getsize(output_path) > 0
    if not produced:
        print(f"合并失败,没有生成 '{output_path}'")
        return ""

    # On success, remove the now-redundant "before" file.
    cover_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
    if not os.path.exists(cover_path):
        print(f"未找到要删除的文件: {cover_path}")
        return output_path
    try:
        os.remove(cover_path)
        print(f"已删除文件: {cover_path}")
    except Exception as e:
        print(f"删除文件 {cover_path} 时出错: {e}")
    return output_path
def get_start_and_common_header(input_path):
common_header = extract_common_header(input_path)
last_begin_index = 0
@ -756,7 +666,7 @@ def truncate_pdf_multiple(pdf_path, output_folder,unique_id="123"):
# 定义合并后的输出路径
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
# 调用 merge_selected_pdfs 并获取返回值
merged_path = merge_selected_pdfs(output_folder, truncate_files, merged_output_path, base_file_name)
merged_path = merge_selected_pdfs_for_goods(output_folder, truncate_files, merged_output_path, base_file_name)
if merged_path:
# 合并成功,添加合并后的文件路径
truncate_files.append(merged_path)
@ -806,7 +716,7 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
# 定义合并后的输出路径
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
# 调用 merge_selected_pdfs 并获取返回值
merged_path = merge_selected_pdfs(output_folder, truncate_files, merged_output_path, base_file_name)
merged_path = merge_selected_pdfs_for_goods(output_folder, truncate_files, merged_output_path, base_file_name)
if merged_path:
# 合并成功,添加合并后的文件路径
truncate_files.append(merged_path)
@ -819,9 +729,9 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
return truncate_files
# TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 资格审查文件可能不需要默认与"evaluation"同一章 无效投标可能也要考虑 “more”的情况类似工程标
# TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 资格审查文件可能不需要默认与"evaluation"同一章 无效投标可能也要考虑 “more”的情况类似工程标 唐山投标只有正文,没有附表
if __name__ == "__main__":
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\唐山市公安交通警察支队机动车查验机构视频存储回放系统竞争性谈判-招标文件正文(1).pdf"
# input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.pdf"
# input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
@ -832,6 +742,6 @@ if __name__ == "__main__":
# selections = [1,4]
# files=truncate_pdf_specific_goods(input_path,output_folder,selections)
print(files)
# selection = 1# 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
# selection = 4# 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
# generated_files = truncate_pdf_main(input_path, output_folder, selection)
# print(generated_files)