zbparse/flask_app/工程标/截取pdf工程标版.py

266 lines
15 KiB
Python
Raw Permalink Normal View History

2025-01-03 17:36:23 +08:00
# flask_app/工程标/截取pdf工程标版.py
import fitz
2024-12-10 17:32:08 +08:00
import regex
2024-10-29 09:04:23 +08:00
import os
2024-10-31 15:03:32 +08:00
import time
from PyPDF2 import PdfReader
2025-02-16 18:09:45 +08:00
from flask_app.general.读取文件.clean_pdf import clean_page_content,create_get_text_function
from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, get_invalid_file, \
extract_pages_tobidders_notice, get_notice, extract_pages_generic
2024-12-17 18:44:58 +08:00
from flask_app.general.通用功能函数 import get_global_logger
2024-11-01 14:28:10 +08:00
2024-10-29 20:40:14 +08:00
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, common_header,
                  output_suffix, invalid_endpage=-1):
    """
    Locate one section of a PDF with a begin/end pattern pair and save it.

    The page range is found by ``extract_pages_generic`` and the pages are
    written by ``save_extracted_pages``.

    Args:
        pdf_path: Path of the PDF to search.
        output_folder: Folder handed to ``save_extracted_pages``.
        begin_pattern: Pattern marking the start of the section.
        begin_page: Page index forwarded to ``extract_pages_generic``.
        end_pattern: Pattern marking the end of the section.
        common_header: Header text forwarded to both helpers.
        output_suffix: Tag identifying the section being extracted.
        invalid_endpage: Forwarded unchanged to ``extract_pages_generic``.

    Returns:
        The value returned by ``save_extracted_pages``, or "" when no page
        range was found or any exception was raised.
    """
    try:
        # Lines ending in "composition/preparation of the documents" style
        # headings; presumably used by the generic helper to skip such
        # matches -- confirm against extract_pages_generic.
        heading_exclusions = regex.compile(
            r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
        first_page, last_page = extract_pages_generic(
            pdf_path, begin_pattern, end_pattern, begin_page,
            common_header, heading_exclusions, output_suffix, invalid_endpage)
        if first_page is None or last_page is None:
            return ""
        return save_extracted_pages(pdf_path, output_folder, first_page, last_page, output_suffix, common_header)
    except Exception as e:
        # Best-effort: report the failure and let the caller try another pattern pair.
        print(f"Error processing {pdf_path}: {e}")
        return ""
2024-11-01 14:28:10 +08:00
2025-02-16 18:09:45 +08:00
2024-11-01 14:28:10 +08:00
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, last_begin_index):
    """
    Fallback extraction used once every begin/end pattern pair has failed.

    Only the "qualification" section is handled; any other ``output_suffix``
    returns "" immediately. Pages are scanned in order: the section starts on
    the first page that both contains one of the qualification keywords and
    matches an appendix/attachment/schedule heading, and it ends at the next
    appendix-style heading on a page without those keywords, or at the next
    chapter heading. The PDF is read with PyPDF2 first and with PyMuPDF if
    that attempt raises.

    Args:
        pdf_path: Path of the PDF to scan.
        output_folder: Folder handed to ``save_extracted_pages``.
        output_suffix: Section tag; must be "qualification" for any work to happen.
        common_header: Header text passed to ``clean_page_content`` and
            ``save_extracted_pages``.
        last_begin_index: Page index to start scanning from. Negative values
            (the caller may pass -1) are treated as 0.

    Returns:
        The value returned by ``save_extracted_pages`` on success, otherwise "".
    """
    if output_suffix != "qualification":
        return ""
    print("最后一次尝试获取qualification!")
    # Keywords that mark a page as belonging to the qualification review.
    include_keys = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查", "资格检查", "能力", "信誉"]
    # Start pattern: only "appendix", "attachment" or "schedule" headings.
    # Compiled once here instead of being re-resolved for every page.
    begin_pattern = regex.compile(
        r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:])',
        regex.MULTILINE
    )
    # End pattern 1: any appendix / attachment / schedule heading.
    end_pattern_attachment = regex.compile(
        r'^(?:附录[一二三四五六七八九1-9]*[:]|附件[一二三四五六七八九1-9]*[:]|附表[一二三四五六七八九1-9]*[:]).*$',
        regex.MULTILINE
    )
    # End pattern 2: chapter / part headings.
    end_pattern_chapter = regex.compile(
        r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
        regex.MULTILINE
    )

    def locate_page_range(total_pages, get_text):
        """Walk the pages and return (start_page, end_page); either may be None."""
        start_page = None
        end_page = None
        # Clamp the first index: with -1, range() would visit index -1 first,
        # and page accessors that accept negative indices would hand back the
        # LAST page, which could then be recorded as start_page = -1.
        first_index = max(last_begin_index, 0)
        for i in range(first_index, total_pages):
            text = get_text(i)
            if not text:
                continue
            cleaned_text = clean_page_content(text, common_header)
            has_key = any(key in cleaned_text for key in include_keys)
            if start_page is None:
                # Start page: contains a keyword AND matches the begin pattern.
                if has_key and begin_pattern.search(cleaned_text):
                    start_page = i
                continue
            # Start page is known; look for the page that closes the section.
            if end_pattern_attachment.search(cleaned_text):
                # An appendix heading only ends the section when the page no
                # longer talks about qualification.
                if not has_key and i > start_page:
                    end_page = i
                    break
                else:
                    print(f"当前页面匹配附件结束模式但包含 include_keys继续查找。页面: {i}")
            elif end_pattern_chapter.search(cleaned_text) and i > start_page:
                end_page = i
                break
        return start_page, end_page

    # First attempt: PyPDF2.
    try:
        with open(pdf_path, "rb") as f:
            pdf_document = PdfReader(f)
            get_text = create_get_text_function('pypdf2', pdf_document)
            total_pages = len(pdf_document.pages)
            start_page, end_page = locate_page_range(total_pages, get_text)
    except Exception as e_pypdf2:
        print(f"extract_pages_twice: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
        # Second attempt: PyMuPDF.
        try:
            with fitz.open(pdf_path) as pdf_document:
                get_text = create_get_text_function('fitz', pdf_document)
                total_pages = pdf_document.page_count
                start_page, end_page = locate_page_range(total_pages, get_text)
        except Exception as e_fitz:
            print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
            return ""

    if start_page is None or end_page is None:
        print(f"{output_suffix} twice: 未找到起始或结束页在文件 {pdf_path} 中!")
        return ""
    return save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)
2024-10-29 20:40:14 +08:00
def truncate_pdf_main_engineering(input_path, output_folder, selection, logger, output_suffix="default",invalid_endpage=-1):
    """
    Cut one section out of an engineering tender PDF (or every PDF in a folder).

    Args:
        input_path: Path to a ``.pdf`` file, or to a folder whose ``.pdf``
            files are each processed.
        output_folder: Folder handed to the extraction helpers.
        selection: Which section to extract:
            1 - tender notice, 2 - bid evaluation method,
            3 - qualification review, 4 - instructions to bidders
            (two parts), 5 - the "invalid" part returned by ``get_invalid_file``.
        logger: Logger used to report per-file failures (``logger.error``).
        output_suffix: Accepted for interface compatibility; the suffix that
            is actually used is chosen from ``selection``.
        invalid_endpage: Forwarded unchanged to the extraction helpers.

    Returns:
        A list. For a single file: ``[path]`` for selections 1-3,
        ``[path1, path2]`` for selection 4 and ``[invalid_path, end_page]``
        for selection 5, with "" standing in for a missing path. For a folder:
        the per-file lists concatenated into one flat list. On failure, a
        list of "" with two entries for selection 4 and one entry otherwise.
    """
    # NOTE(review): a failing selection 5 yields one element while a
    # successful one yields two -- confirm callers cope with both shapes.
    empty_result = [""] * (2 if selection == 4 else 1)
    try:
        # Nested worker that handles exactly one PDF file.
        def process_single_file(pdf_path, output_folder, selection):
            try:
                # Common header text and the index reported as the start of the body.
                common_header, last_begin_index = get_start_and_common_header(pdf_path, 20)
                # When no start index was detected (-1), fall back to a
                # per-selection default page to begin searching from.
                begin_page = last_begin_index if last_begin_index != -1 else {
                    1: 0,  # notice
                    2: 5,  # evaluation method
                    3: 5,  # qualification
                    4: 0,  # instructions to bidders
                    5: 0   # invalid part
                }.get(selection, 0)

                if selection == 1:
                    # Tender notice
                    path = get_notice(pdf_path, output_folder, begin_page, common_header, invalid_endpage)
                    return [path or ""]
                elif selection == 2:
                    # Bid evaluation method: (begin, end) pattern pairs tried in order.
                    pattern_pairs = [
                        (
                            regex.compile(
                                r'^第[一二三四五六七八九十]+(?:章|部分)\s*'
                                r'(?<!"\s*)(?<!“\s*)(?<!”\s*)(?=.*(?:磋商(?=.*(?:办法|方法|内容))|'
                                r'谈判(?=.*(?:办法|方法|内容))|评标|评定|评审))',
                                regex.MULTILINE
                            ),
                            # Alternative end pattern
                            regex.compile(
                                r'^第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
                                r'[:]清标报告\s*$|'
                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文(?:部分)?\s*$|'
                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:评标|定标)详细程序\s*$',
                                regex.MULTILINE
                            )
                        ),
                        (
                            regex.compile(
                                r'(?:(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!"\s*)(?<!"\s*)(?<!"\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*'  # first alternative
                                r'(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:[\u4e00-\u9fff、()]*?)'
                                r'(?=.*(?:(?:磋商|谈判)(?=.*(?:办法|方法|内容))|(?:评标|评定|评审)))'  # modified part
                                r'[\u4e00-\u9fff、()]*\s*$|'
                                r'^\s*(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!"\s*)(?<!"\s*)(?<!"\s*)评标(方法|办法)(?:前附表)?\s*$)',
                                # second alternative
                                regex.MULTILINE
                            ),
                            regex.compile(
                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
                                regex.MULTILINE
                            )
                        )
                    ]
                    section_suffix = "evaluation_method"
                elif selection == 3:
                    # Qualification review conditions
                    pattern_pairs = [
                        (
                            regex.compile(
                                r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:]).*(?:资质|资格).*$|'
                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*'
                                r'[\u4e00-\u9fff、()]*资格[\u4e00-\u9fff、()]*\s*$',
                                regex.MULTILINE
                            ),
                            regex.compile(
                                r'^(?:附录[一二三四五六七八九1-9]*[:]|附件[一二三四五六七八九1-9]*[:]|附表[一二三四五六七八九1-9]*[:])(?!.*(?:资质|资格)).*|'
                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+',
                                regex.MULTILINE
                            )
                        )
                    ]
                    section_suffix = "qualification"
                elif selection == 4:
                    # Instructions to bidders (two output files)
                    path1, path2 = extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_header, invalid_endpage)
                    return [path1 or "", path2 or ""]
                elif selection == 5:
                    # Everything before the bid-document format / contract terms
                    invalid_path, end_page = get_invalid_file(pdf_path, output_folder, common_header, begin_page)
                    return [invalid_path or "", end_page]
                else:
                    print("无效的选择:请选择1-5")
                    return [""]

                # Selections 2 and 3: the first pattern pair that produces a file wins.
                for begin_pattern, end_pattern in pattern_pairs:
                    result = extract_pages(
                        pdf_path,
                        output_folder,
                        begin_pattern,
                        begin_page=begin_page,
                        invalid_endpage=invalid_endpage,
                        end_pattern=end_pattern,
                        output_suffix=section_suffix,
                        common_header=common_header
                    )
                    if result:
                        return [result]

                # Every pattern pair failed: try the fallback extractor.
                print(f"Selection {selection}: 所有模式对都未匹配成功。尝试执行 extract_pages_twice 或返回空。{pdf_path}")
                fallback_result = extract_pages_twice(
                    pdf_path,
                    output_folder,
                    section_suffix,
                    common_header,
                    last_begin_index
                )
                return [fallback_result or ""]
            except Exception as e:
                logger.error(f"Error in processing file '{pdf_path}': {e}")
                return [""] * (2 if selection == 4 else 1)

        # Folder input: process every .pdf inside it.
        if os.path.isdir(input_path):
            generated_files = []
            for file_name in os.listdir(input_path):
                if file_name.endswith('.pdf'):
                    pdf_path = os.path.join(input_path, file_name)
                    files = process_single_file(pdf_path, output_folder, selection)
                    # process_single_file returns a list; the original check
                    # for ``tuple`` never matched, so per-file lists ended up
                    # nested instead of being flattened into the result.
                    if isinstance(files, (list, tuple)):
                        generated_files.extend(files)
                    else:
                        generated_files.append(files)
            return generated_files
        elif os.path.isfile(input_path) and input_path.endswith('.pdf'):
            return process_single_file(input_path, output_folder, selection)
        else:
            print("提供的路径既不是文件夹也不是PDF文件。")
            return empty_result
    except Exception as e:
        # The message previously named a different function (copy-paste slip).
        print(f"Error in truncate_pdf_main_engineering: {e}")
        return empty_result
2024-10-29 09:04:23 +08:00
2025-01-03 17:36:23 +08:00
2024-08-29 16:37:09 +08:00
if __name__ == "__main__":
    # Manual smoke run against a local tender PDF; paths are machine-specific.
    logger = get_global_logger("123")
    started_at = time.time()

    # Alternative inputs used during development:
    # input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
    # pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
    # input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
    pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\招标文件-通产丽星高端化妆品研发生产总部基地高低压配电工程.pdf"
    output_folder = r"C:\Users\Administrator\Desktop\货物标\output2"

    # 1 - notice, 2 - evaluation method, 3 - qualification review,
    # 4 - instructions to bidders (front table = part 1, body = part 2), 5 - invalid part.
    selection = 2

    generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection, logger)
    print(generated_files)

    finished_at = time.time()
    print("耗时:" + str(finished_at - started_at))