12.18 截取pdf

This commit is contained in:
zy123 2024-12-18 16:01:32 +08:00
parent b9c1201ec8
commit ccfbe522f8
8 changed files with 233 additions and 114 deletions

View File

@ -24,36 +24,33 @@ def truncate_pdf_multiple(pdf_path, output_folder, logger,mode='goods',selection
返回:
- list: 截取和合并后的文件路径列表如果截取或合并失败则包含空字符串
"""
if selections is None:
selections = [1, 2, 3, 4, 5]
# 确认模式有效
if mode not in ['goods', 'engineering']:
raise ValueError("mode 参数必须是 'goods''engineering'")
def handle_exception(selection):
return ["", ""] if selection == 4 else [""]
# 设置模式相关的参数和函数
if mode == 'goods':
logger.info("call 货物标截取pdf")
truncate_function = truncate_pdf_main_goods
merge_mode = 'goods'
modes_config = {
"goods": {"selections": [1, 2, 3, 4, 5, 6], "truncate_func": truncate_pdf_main_goods},
"engineering": {"selections": [1, 2, 3, 4, 5], "truncate_func": truncate_pdf_main_engineering},
}
# 根据 'goods' 模式定义异常处理的逻辑
else: # mode == 'engineering'
logger.info("call 工程标标截取pdf")
truncate_function = truncate_pdf_main_engineering
merge_mode = 'engineering'
# 验证 mode 是否有效
if mode not in modes_config:
raise ValueError("mode 参数必须是 'goods''engineering'")
num_selections = len(selections)
res = check_pdf_pages(pdf_path, logger)
if res is not None:
return res # 返回包含空字符串的列表
# 初始化 mode 参数
config = modes_config[mode]
truncate_function = config["truncate_func"]
selections = selections or config["selections"]
# 检查 PDF 页数逻辑
skip, empty_return = check_pdf_pages(pdf_path, mode, logger)
if skip:
return empty_return
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] # 纯文件名
truncate_files = []
# 使用 ThreadPoolExecutor 进行多线程处理
with concurrent.futures.ThreadPoolExecutor(max_workers=num_selections) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
# 提交所有任务并保持 selection 顺序
future_to_selection = {
selection: executor.submit(
@ -73,14 +70,17 @@ def truncate_pdf_multiple(pdf_path, output_folder, logger,mode='goods',selection
future = future_to_selection.get(selection)
try:
files = future.result()
if files:
# 扁平化返回的结果
if isinstance(files, list):
truncate_files.extend(files)
# 无需额外处理,因为 `truncate_function` 已处理空字符串的添加和日志记录
elif isinstance(files, str): # 如果返回单个字符串,直接添加
truncate_files.append(files)
else:
logger.warning(f"未知的返回类型: {type(files)},跳过该结果")
except Exception as e:
logger.error(f"Selection {selection} 生成了一个异常: {e}")
# 根据模式和 selection 添加相应数量的空字符串
truncate_files.extend(handle_exception(selection))
# 定义合并后的输出路径
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
@ -90,7 +90,7 @@ def truncate_pdf_multiple(pdf_path, output_folder, logger,mode='goods',selection
truncate_files,
merged_output_path,
base_file_name,
mode=merge_mode
mode=mode
)
if merged_path:
@ -109,15 +109,17 @@ if __name__ == "__main__":
logger=get_global_logger("123")
start_time = time.time()
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf"
# pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf"
pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
# pdf_path=r'C:\Users\Administrator\Desktop\货物标\zbfiles\招标文件(实高电子显示屏).pdf'
# input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp"
# selections = [1, 4] # 仅处理 selection 4、1
# selections=[5]
selections=[6]
#engineering
files=truncate_pdf_multiple(pdf_path,output_folder,logger,'goods')
# files=truncate_pdf_multiple(pdf_path,output_folder,logger,'goods',selections)
files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'engineering')
print(files)
# selection = 1 # 例如1 - 招标公告, 2 - 评标办法, 3 -资格审查条件 4-投标人须知前附表+正文 5-无效标
# generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection,logger)

View File

@ -7,6 +7,7 @@ from PyPDF2 import PdfReader, PdfWriter
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
from flask_app.general.format_change import docx2pdf
import concurrent.futures
def get_start_and_common_header(input_path,end_page):
@ -34,20 +35,26 @@ def get_start_and_common_header(input_path,end_page):
return common_header, last_begin_index
return common_header, last_begin_index
def check_pdf_pages(pdf_path, mode, logger):
    """Decide whether the splitting step should be skipped for a PDF.

    Splitting is skipped when the PDF has 50 pages or fewer, or when its page
    count cannot be read at all.

    Args:
        pdf_path (str): Path of the PDF whose page count is inspected.
        mode (str): 'goods' yields an eight-slot placeholder list; every other
            value yields a seven-slot one.
        logger: Logger that receives the page count and any read error.

    Returns:
        tuple: ``(skip, placeholder)``. ``skip`` is True when splitting must
        not run, in which case ``placeholder`` is a list of empty strings for
        the caller to return as-is. Otherwise ``(False, [])``.
    """
    # Built once instead of repeating the literal list in every return branch.
    placeholder = [''] * (8 if mode == 'goods' else 7)

    try:
        reader = PdfReader(pdf_path)
        num_pages = len(reader.pages)
    except Exception as e:
        # An unreadable PDF cannot be split; hand back the placeholder result.
        logger.error(f"无法读取 PDF 页数: {e}")
        return True, placeholder

    logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}")
    if num_pages <= 50:
        logger.info("PDF页数小于或等于50页跳过切分逻辑。")
        return True, placeholder

    # More than 50 pages: the caller continues with the normal splitting flow.
    return False, []
def save_extracted_pages(pdf_path,output_folder,start_page, end_page, output_suffix,common_header):
@ -108,22 +115,151 @@ def convert_to_pdf(file_path):
return docx2pdf(file_path)
return file_path
def get_invalid_file(file_path, output_folder, common_header, begin_page=0):
    """Cut the page span used for invalid-bid analysis out of a PDF and save it.

    Start page: searched forwards with the begin patterns, over at most 30
    pages starting at ``begin_page``. Documents with fewer than 100 pages skip
    the search and start at page 0; page 0 is also the fallback when nothing
    matches.

    End page: searched with the end patterns, never looking at the first 30
    pages. Documents with more than 200 pages are scanned front to back, all
    others back to front. When nothing matches, the end page is
    ``max(100, 2/3 of the page count)`` for documents over 100 pages and the
    last page otherwise.

    Args:
        file_path (str): Path of the input PDF.
        output_folder (str): Folder the extracted PDF is written to.
        common_header (str): Shared page header stripped from every page's
            text before matching.
        begin_page (int): Zero-based page index where the start-page search
            begins. Defaults to 0.

    Returns:
        str: The value returned by ``save_extracted_pages`` for the selected
        span, or "" when the range is invalid or any error occurs.
    """
    # Start-of-span patterns, tried in order of priority.
    begin_patterns = [
        regex.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请',
            regex.MULTILINE
        ),
        regex.compile(
            r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
            regex.MULTILINE
        )
    ]
    # End-of-span patterns, tried in order of priority.
    end_patterns = [
        regex.compile(
            r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告',
            regex.MULTILINE
        ),
        regex.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*',
            regex.MULTILINE
        ),
        regex.compile(
            r"\s*(投标文件|响应文件|响应性文件)(?:的)?格式\s*",
            regex.MULTILINE
        )
    ]
    # A page matching this is never accepted as the start page.
    exclusion_pattern = regex.compile(
        r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成',
        regex.MULTILINE
    )

    try:
        pdf_document = PdfReader(file_path)
        total_pages = len(pdf_document.pages)

        # Extract and clean every page once; both searches reuse this list.
        page_texts = []
        for page in pdf_document.pages:
            text = page.extract_text()
            page_texts.append(clean_page_content(text, common_header) if text else "")

        def find_start_page(first_page):
            """Return the first page matching a begin pattern, or 0."""
            first_page = max(first_page, 0)  # a negative index would wrap around
            last_page = min(first_page + 30, total_pages)
            for pattern in begin_patterns:
                for i in range(first_page, last_page):
                    text = page_texts[i]
                    if exclusion_pattern.search(text):
                        continue
                    if pattern.search(text):
                        return i
            return 0  # no match: start from the first page

        def find_end_page():
            """Return the page matching an end pattern, or a size-based fallback."""
            if total_pages > 200:
                # Very long documents: first match scanning forwards from page 30.
                candidates = range(30, total_pages)
            else:
                # Otherwise scan backwards, stopping before the first 30 pages.
                candidates = range(total_pages - 1, 29, -1)
            for pattern in end_patterns:
                for i in candidates:
                    if pattern.search(page_texts[i]):
                        return i
            if total_pages > 100:
                print("未找到结束模式总页数大于100设置结束页为前100页。"+file_path)
                return max(100, int(total_pages * 2 / 3))
            print("未找到结束模式,设置结束页为最后一页。"+file_path)
            return total_pages - 1

        # Both searches only read page_texts, so they run sequentially;
        # a thread pool adds overhead without changing the result.
        start_page = 0 if total_pages < 100 else find_start_page(begin_page)
        end_page = find_end_page()

        if start_page > end_page:
            print(f"无效的页码范围: 起始页 ({start_page + 1}) > 结束页 ({end_page + 1})")
            # Return a plain empty string like every other failure path, so
            # callers that test truthiness do not see a non-empty list.
            return ""

        return save_extracted_pages(
            pdf_path=file_path,
            output_folder=output_folder,
            start_page=start_page,
            end_page=end_page,
            output_suffix="invalid",
            common_header=common_header
        )
    except Exception as e:
        print(f"处理文件 {file_path} 时发生错误: {e}")
        return ""
if __name__ == "__main__":
    # Manual smoke run against a local sample file.
    file_path = r'C:\Users\Administrator\Desktop\new招标文件\货物标\HBDL-2024-0310-001-招标文件.pdf'
    output_folder = r'C:\Users\Administrator\Desktop\new招标文件\货物标\tmp'
    # Pass begin_page explicitly (0 = search from the first page); calling
    # with only three arguments raises TypeError on the 4-parameter signature.
    res = get_invalid_file(file_path, output_folder, "", 0)

View File

@ -54,7 +54,7 @@ def preprocess_files(output_folder, file_path, file_type,logger):
tobidders_notice_table = truncate_files[3] #投标人须知前附表
tobidders_notice=truncate_files[4] #投标人须知正文
invalid_path=truncate_files[5] if truncate_files[5]!="" else pdf_path #无效标
invalid_path=truncate_files[5] if truncate_files[5] != "" else pdf_path #无效标
# invalid_docpath = copy_docx(docx_path) # docx截取无效标部分
# invalid_docpath=pdf2docx(invalid_path)
invalid_added_pdf = insert_mark(invalid_path)

View File

@ -43,7 +43,7 @@ def preprocess_files(output_folder, file_path, file_type,logger):
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods') # index: 0->商务技术服务要求 1->评标办法 2->资格审查 3->投标人须知前附表 4->投标人须知正文
# 处理各个部分
invalid_path=pdf_path
invalid_path=truncate_files[6] if truncate_files[6] != "" else pdf_path #无效标
invalid_added_pdf = insert_mark(invalid_path)
invalid_added_docx = pdf2docx(invalid_added_pdf) # 有标记的invalid_path
@ -62,7 +62,7 @@ def preprocess_files(output_folder, file_path, file_type,logger):
qualification_path = truncate_files[2] # 资格审查
tobidders_notice_path = truncate_files[4] # 投标人须知正文
notice_path = truncate_files[0] #招标公告
merged_baseinfo_path = truncate_files[6] # 合并封面+招标公告+投标人须知前附表+须知正文
merged_baseinfo_path = truncate_files[7] # 合并封面+招标公告+投标人须知前附表+须知正文
clause_path = convert_clause_to_json(tobidders_notice_path, output_folder) # 投标人须知正文条款pdf->json
end_time = time.time()

View File

@ -1,10 +1,9 @@
import regex
begin_pattern = regex.compile(
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
r'(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文\s*$',
regex.MULTILINE
)
r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|^第二卷',
regex.MULTILINE
)
# 测试示例

View File

@ -220,14 +220,6 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
return []
else:
return [save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)]
elif output_suffix == "invalid":
pdf_document = PdfReader(pdf_path)
total_pages = len(pdf_document.pages)
# 计算总页数的三分之二
total = int(total_pages * 2 / 3)
start_page = last_begin_index
end_page = min(90, total)
return [save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)]
else:
print(f"{output_suffix} twice: 未定义的输出后缀。")
return []
@ -346,29 +338,35 @@ def truncate_pdf_main_engineering(input_path, output_folder, selection,logger,ou
]
output_suffix = "tobidders_notice"
elif selection == 5:
# Selection 5: 无效标
pattern_pairs = [
(
regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'
),
regex.compile(
r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|^第二卷',
regex.MULTILINE
)
),
(
regex.compile(
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
regex.MULTILINE
),
regex.compile(
r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|^第二卷',
regex.MULTILINE
)
)
]
output_suffix = "invalid"
begin_page=last_begin_index
invalid_path = get_invalid_file(file_path,output_folder,common_header,begin_page)
if invalid_path:
return [invalid_path]
else:
return [""]
# # Selection 5: 无效标
# pattern_pairs = [
# (
# regex.compile(
# r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'
# ),
# regex.compile(
# r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|^第二卷',
# regex.MULTILINE
# )
# ),
# (
# regex.compile(
# r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
# regex.MULTILINE
# ),
# regex.compile(
# r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|^第二卷',
# regex.MULTILINE
# )
# )
# ]
# output_suffix = "invalid"
else:
print("无效的选择:请选择1-5")
return [""]
@ -383,21 +381,6 @@ def truncate_pdf_main_engineering(input_path, output_folder, selection,logger,ou
5: 0 # 无效标
}.get(selection, 0)
if selection == 5: #无效标
invalid_path = list(
get_invalid_file(
file_path,
output_folder,
begin_pattern,
common_header,
)
)
if invalid_path:
return invalid_path
else:
# print(f"Selection {selection}: 使用组合提取函数失败。尝试下一个模式对。")
continue
if selection == 4: # 投标人须知
output_paths = list(
extract_pages_tobidders_notice(
@ -474,14 +457,14 @@ if __name__ == "__main__":
logger=get_global_logger("123")
start_time = time.time()
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
pdf_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest4_evaluation_method.pdf"
pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles"
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
# input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp"
# selection = 1 # 例如1 - 招标公告, 2 - 评标办法, 3 -资格审查条件 4-投标人须知前附表+正文 5-无效标
# generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection,logger)
# print(generated_files)
output_folder = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\tmp"
selection = 5 # 例如1 - 招标公告, 2 - 评标办法, 3 -资格审查条件 4-投标人须知前附表+正文 5-无效标
generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection,logger)
print(generated_files)
# print("生成的文件:", generated_files)
end_time = time.time()
print("耗时:" + str(end_time - start_time))

View File

@ -4,7 +4,7 @@ import os # 用于文件和文件夹操作
from flask_app.general.clean_pdf import clean_page_content, extract_common_header
from flask_app.general.merge_pdfs import merge_and_cleanup
from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, is_pdf_or_doc, \
convert_to_pdf
convert_to_pdf, get_invalid_file
from flask_app.general.通用功能函数 import get_global_logger
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
@ -473,6 +473,7 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
os.makedirs(output_folder)
# 获取起始和通用页眉
pdf_path = convert_to_pdf(input_path)
common_header, last_begin_index = get_start_and_common_header(input_path, 10)
begin_page = last_begin_index if last_begin_index != 0 else {
4: 1,
@ -529,13 +530,12 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
)
local_output_suffix = "procurement"
elif selection == 6:
begin_pattern = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:格式).*'
)
end_pattern = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE
)
local_output_suffix = "format"
begin_page = last_begin_index
invalid_path = get_invalid_file(pdf_path, output_folder, common_header, begin_page)
if invalid_path:
return [invalid_path]
else:
return [""]
else:
print("无效的选择:请选择1-6")
return ['']
@ -545,7 +545,6 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
output_suffix = local_output_suffix
# 将原先的 process_files 逻辑合并到此处
pdf_path = convert_to_pdf(input_path)
result = extract_pages(
pdf_path,
output_folder,