zbparse/flask_app/工程标/截取pdf工程标版.py
2025-02-07 15:27:24 +08:00

265 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# flask_app/工程标/截取pdf工程标版.py
import fitz
import regex
import os
import time
from PyPDF2 import PdfReader
from flask_app.general.读取文件.clean_pdf import clean_page_content
from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, get_invalid_file, \
extract_pages_tobidders_notice, get_notice, extract_pages_generic
from flask_app.general.通用功能函数 import get_global_logger
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, common_header, output_suffix,
                  invalid_endpage=-1):
    """
    Locate one section of a PDF with a begin/end pattern pair and save it.

    The page range is resolved by ``extract_pages_generic`` and, when both
    bounds are found, written out by ``save_extracted_pages``.

    Args:
        pdf_path: Path of the source PDF.
        output_folder: Folder handed to ``save_extracted_pages``.
        begin_pattern: Pattern marking the start of the section.
        begin_page: Page index handed to ``extract_pages_generic`` as the
            search start.
        end_pattern: Pattern marking the end of the section.
        common_header: Common page header forwarded to both helpers.
        output_suffix: Suffix identifying the kind of section being extracted.
        invalid_endpage: Forwarded unchanged to ``extract_pages_generic``
            (defaults to -1).

    Returns:
        Whatever ``save_extracted_pages`` returns when a full page range was
        found; otherwise an empty string. Any exception is reported on stdout
        and also yields an empty string.
    """
    try:
        # Lines ending in "composition/preparation of the document" headings;
        # presumably used by the generic helper to skip false start matches.
        skip_headings = regex.compile(
            r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$',
            regex.MULTILINE,
        )
        page_range = extract_pages_generic(
            pdf_path, begin_pattern, end_pattern, begin_page,
            common_header, skip_headings, output_suffix, invalid_endpage,
        )
        first_page, last_page = page_range
        # Guard clause: both bounds are required before anything is saved.
        if first_page is None or last_page is None:
            return ""
        return save_extracted_pages(pdf_path, output_folder, first_page, last_page, output_suffix, common_header)
    except Exception as err:
        print(f"Error processing {pdf_path}: {err}")
        return ""
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, last_begin_index):
    """
    Last-resort extraction, called after every begin/end pattern pair failed.

    Only the ``"qualification"`` suffix has a fallback strategy: starting at
    ``last_begin_index``, the first page that contains one of the
    qualification keywords and a line starting with an appendix/attachment/
    schedule heading (optionally numbered one) becomes the start page. The
    end page is the first later page that either starts another
    appendix-style heading without any qualification keyword, or carries a
    chapter/part heading.

    Args:
        pdf_path: Path of the source PDF.
        output_folder: Folder handed to ``save_extracted_pages``.
        output_suffix: Kind of section requested; anything other than
            ``"qualification"`` returns ``""`` immediately.
        common_header: Common page header, forwarded to ``clean_page_content``
            and ``save_extracted_pages``.
        last_begin_index: Zero-based page index at which scanning starts.

    Returns:
        The result of ``save_extracted_pages`` when both bounds were found,
        otherwise an empty string. Exceptions are reported on stdout and also
        yield an empty string.
    """
    # No fallback exists for other suffixes, so do not parse the PDF at all.
    if output_suffix != "qualification":
        return ""

    fitz_document = None  # set only when the PyMuPDF fallback reader is used
    try:
        try:
            pdf_document = PdfReader(pdf_path)
            is_pypdf2 = True
            total_pages = len(pdf_document.pages)
        except Exception as e_pypdf2:
            print(f"extract_pages_twice:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
            try:
                fitz_document = fitz.open(pdf_path)
                pdf_document = fitz_document
                is_pypdf2 = False
                total_pages = pdf_document.page_count
            except Exception as e_fitz:
                print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
                return ""

        print("最后一次尝试获取qualification!")
        # Keywords that mark a page as belonging to the qualification review.
        include_keys = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查", "资格检查", "能力", "信誉"]
        # Start heading: appendix / attachment / schedule, optionally numbered one.
        # NOTE(review): as written the class `[:]` accepts only an ASCII colon;
        # confirm whether the full-width colon should be accepted as well.
        begin_pattern = regex.compile(
            r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:])',
            regex.MULTILINE
        )
        # End heading, variant 1: any appendix / attachment / schedule heading.
        end_pattern_attachment = regex.compile(
            r'^(?:附录[一二三四五六七八九1-9]*[:]|附件[一二三四五六七八九1-9]*[:]|附表[一二三四五六七八九1-9]*[:]).*$',
            regex.MULTILINE
        )
        # End heading, variant 2: a "第N章 / 第N部分" chapter or part title.
        end_pattern_chapter = regex.compile(
            r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
            regex.MULTILINE
        )

        def get_text(page_num):
            """Return the raw text of one page, or "" when extraction fails."""
            try:
                if is_pypdf2:
                    return pdf_document.pages[page_num].extract_text() or ""
                return pdf_document.load_page(page_num).get_text() or ""
            except Exception as e:
                print(f"提取第 {page_num} 页文本时出错: {e}")
                return ""

        start_page = None
        end_page = None
        for i in range(last_begin_index, total_pages):
            text = get_text(i)
            if not text:
                continue
            cleaned_text = clean_page_content(text, common_header)
            has_include_key = any(key in cleaned_text for key in include_keys)

            # Start page: first keyword page that also opens an appendix heading.
            if start_page is None and has_include_key and begin_pattern.search(cleaned_text):
                start_page = i
                continue
            if start_page is None:
                continue

            if end_pattern_attachment.search(cleaned_text):
                if has_include_key:
                    # Still qualification content under a new heading: keep going.
                    print(f"当前页面匹配附件结束模式但包含 include_keys继续查找。页面: {i}")
                elif i > start_page:
                    end_page = i
                    break
            elif end_pattern_chapter.search(cleaned_text) and i > start_page:
                end_page = i
                break

        if start_page is None or end_page is None:
            print(f"{output_suffix} twice: 未找到起始或结束页在文件 {pdf_path} 中!")
            return ""
        return save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)
    except Exception as e:
        print(f"Error in extract_pages_twice: {e}")
        return ""
    finally:
        # Release the PyMuPDF handle; the original never closed it.
        if fitz_document is not None:
            try:
                fitz_document.close()
            except Exception as e_close:
                print(f"Error closing PDF in extract_pages_twice: {e_close}")
def truncate_pdf_main_engineering(input_path, output_folder, selection, logger, output_suffix="default", invalid_endpage=-1):
    """
    Extract one kind of section from an engineering tender PDF (or a folder of PDFs).

    Args:
        input_path: Path of a PDF file, or of a folder whose PDF files are
            processed one by one. The ``.pdf`` extension is matched
            case-insensitively.
        output_folder: Folder handed to the extraction helpers.
        selection: Which section to extract:
            1 - tender notice, 2 - bid evaluation method,
            3 - qualification review, 4 - instructions to bidders
            (two outputs), 5 - "invalid bid" part.
        logger: Logger used to report per-file failures (``logger.error``).
        output_suffix: Accepted for interface compatibility; this function
            does not read it - the suffix is derived from ``selection``.
        invalid_endpage: Forwarded to the extraction helpers (defaults to -1).

    Returns:
        For a single PDF, a list: one path for selections 1-3, two paths for
        selection 4, ``[invalid_path, end_page]`` for selection 5; missing
        paths are ``""``. For a folder, a list holding one such list per PDF.
        On an invalid path or an unexpected error, ``[""]`` (``["", ""]`` for
        selection 4).
        NOTE(review): the folder result is nested while the single-file
        result is flat; confirm which shape callers expect.
    """
    try:
        def process_single_file(pdf_path, output_folder, selection):
            """Run the extraction selected by ``selection`` on one PDF and return a list."""
            try:
                # Common page header plus the page index where real content starts.
                common_header, last_begin_index = get_start_and_common_header(pdf_path, 20)
                if last_begin_index != 0:
                    begin_page = last_begin_index
                else:
                    # Per-selection default start page when none was detected.
                    begin_page = {
                        1: 0,  # notice
                        2: 5,  # bid evaluation
                        3: 5,  # qualification
                        4: 0,  # instructions to bidders
                        5: 0   # invalid bid
                    }.get(selection, 0)

                if selection == 1:
                    # Tender notice
                    path = get_notice(pdf_path, output_folder, begin_page, common_header, invalid_endpage)
                    return [path or ""]
                if selection == 4:
                    # Instructions to bidders: two outputs
                    path1, path2 = extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_header, invalid_endpage)
                    return [path1 or "", path2 or ""]
                if selection == 5:
                    # Content preceding the bid-document format / contract terms
                    invalid_path, end_page = get_invalid_file(pdf_path, output_folder, common_header, begin_page)
                    return [invalid_path or "", end_page]

                if selection == 2:
                    # Bid evaluation method
                    pattern_pairs = [
                        (
                            regex.compile(
                                r'^第[一二三四五六七八九十]+(?:章|部分)\s*'
                                r'(?<!"\s*)(?<!“\s*)(?<!”\s*)(?=.*(?:磋商(?=.*(?:办法|方法|内容))|'
                                r'谈判(?=.*(?:办法|方法|内容))|评标|评定|评审))',
                                regex.MULTILINE
                            ),
                            # Alternative end pattern
                            regex.compile(
                                r'^第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
                                r'[:]清标报告\s*$|'
                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文(?:部分)?\s*$|'
                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:评标|定标)详细程序\s*$',
                                regex.MULTILINE
                            )
                        ),
                        (
                            # NOTE(review): the three consecutive `(?<!"\s*)` lookbehinds
                            # below are identical as written; the sibling patterns use
                            # the straight and both curly quotes - confirm the intent.
                            regex.compile(
                                r'(?:(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!"\s*)(?<!"\s*)(?<!"\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*'  # first form
                                r'(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:[\u4e00-\u9fff、()]*?)'
                                r'(?=.*(?:(?:磋商|谈判)(?=.*(?:办法|方法|内容))|(?:评标|评定|评审)))'  # modified part
                                r'[\u4e00-\u9fff、()]*\s*$|'
                                r'^\s*(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!"\s*)(?<!"\s*)(?<!"\s*)评标(方法|办法)(?:前附表)?\s*$)',
                                # second form
                                regex.MULTILINE
                            ),
                            regex.compile(
                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
                                regex.MULTILINE
                            )
                        )
                    ]
                    suffix = "evaluation_method"
                elif selection == 3:
                    # Qualification review conditions
                    pattern_pairs = [
                        (
                            regex.compile(
                                r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:]).*(?:资质|资格).*$|'
                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*'
                                r'[\u4e00-\u9fff、()]*资格[\u4e00-\u9fff、()]*\s*$',
                                regex.MULTILINE
                            ),
                            regex.compile(
                                r'^(?:附录[一二三四五六七八九1-9]*[:]|附件[一二三四五六七八九1-9]*[:]|附表[一二三四五六七八九1-9]*[:])(?!.*(?:资质|资格)).*|'
                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+',
                                regex.MULTILINE
                            )
                        )
                    ]
                    suffix = "qualification"
                else:
                    print("无效的选择:请选择1-5")
                    return [""]

                # Try each pattern pair in order; the first non-empty result wins.
                for begin_pattern, end_pattern in pattern_pairs:
                    result = extract_pages(
                        pdf_path,
                        output_folder,
                        begin_pattern,
                        begin_page=begin_page,
                        invalid_endpage=invalid_endpage,
                        end_pattern=end_pattern,
                        output_suffix=suffix,
                        common_header=common_header
                    )
                    if result:
                        return [result]

                # Every pattern pair failed: fall back to the second-chance extractor.
                print(f"Selection {selection}: 所有模式对都未匹配成功。尝试执行 extract_pages_twice 或返回空。{pdf_path}")
                fallback_result = extract_pages_twice(
                    pdf_path,
                    output_folder,
                    suffix,
                    common_header,
                    last_begin_index
                )
                return [fallback_result or ""]
            except Exception as e:
                logger.error(f"Error in processing file '{pdf_path}': {e}")
                return [""] * (2 if selection == 4 else 1)

        if os.path.isdir(input_path):
            generated_files = []
            for file_name in os.listdir(input_path):
                # Case-insensitive so that e.g. "X.PDF" is not silently skipped.
                if not file_name.lower().endswith('.pdf'):
                    continue
                pdf_path = os.path.join(input_path, file_name)
                # process_single_file always returns a list; keep one entry per PDF.
                generated_files.append(process_single_file(pdf_path, output_folder, selection))
            return generated_files
        if os.path.isfile(input_path) and input_path.lower().endswith('.pdf'):
            return process_single_file(input_path, output_folder, selection)
        print("提供的路径既不是文件夹也不是PDF文件。")
        return [""] * (2 if selection == 4 else 1)
    except Exception as e:
        # The message previously named a different function (copy-paste slip).
        print(f"Error in truncate_pdf_main_engineering: {e}")
        return [""] * (2 if selection == 4 else 1)
if __name__ == "__main__":
    # Manual smoke run: extract one section from a local sample PDF and
    # report the generated paths together with the elapsed wall-clock time.
    logger = get_global_logger("123")
    start_time = time.time()

    # Sample input; point this at another file or folder to test other cases.
    pdf_path = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\新建文件夹\(存储存在问题)惠安县招标文件.pdf"
    output_folder = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\新建文件夹"

    # 1 - tender notice, 2 - bid evaluation method, 3 - qualification review,
    # 4 - instructions to bidders (front table + body), 5 - invalid bid
    selection = 4

    generated_files = truncate_pdf_main_engineering(
        pdf_path,
        output_folder,
        selection,
        logger,
    )
    print(generated_files)

    end_time = time.time()
    elapsed = end_time - start_time
    print(f"耗时:{elapsed}")