# flask_app/工程标/截取pdf工程标版.py
import regex
|
||
import os
|
||
import time
|
||
from PyPDF2 import PdfReader
|
||
from flask_app.general.clean_pdf import clean_page_content
|
||
from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, get_invalid_file, \
|
||
extract_pages_tobidders_notice, get_notice, extract_pages_generic
|
||
from flask_app.general.通用功能函数 import get_global_logger
|
||
|
||
|
||
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, common_header, output_suffix, invalid_endpage=-1):
    """
    Locate one section of a PDF with a begin/end pattern pair and save it.

    The page range is resolved by ``extract_pages_generic``; lines ending in a
    "composition/preparation of the document" style heading are passed to it
    as an exclusion pattern.

    Args:
        pdf_path: Path of the PDF to read.
        output_folder: Directory passed on to ``save_extracted_pages``.
        begin_pattern: Pattern marking the start of the section.
        begin_page: Page index forwarded to ``extract_pages_generic``.
        end_pattern: Pattern marking the end of the section.
        common_header: Header text forwarded to the helpers.
        output_suffix: Label of the section being extracted.
        invalid_endpage: Forwarded unchanged to ``extract_pages_generic``.

    Returns:
        Whatever ``save_extracted_pages`` returns when both page bounds were
        found; otherwise (or on any exception) the empty string.
    """
    try:
        reader = PdfReader(pdf_path)
        heading_exclusions = regex.compile(
            r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
        first_page, last_page = extract_pages_generic(
            reader, begin_pattern, end_pattern, begin_page,
            common_header, heading_exclusions, output_suffix, invalid_endpage)

        # Guard clause: without both bounds there is nothing to save.
        if first_page is None or last_page is None:
            return ""
        return save_extracted_pages(pdf_path, output_folder, first_page, last_page, output_suffix, common_header)
    except Exception as exc:
        # Best-effort contract: report the problem and signal failure with "".
        print(f"Error processing {pdf_path}: {exc}")
        return ""
|
||
|
||
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, last_begin_index):
    """
    Fallback extraction, called after every begin/end pattern pair has failed.

    Only ``output_suffix == "qualification"`` has a fallback strategy. Pages
    from ``last_begin_index`` onward are scanned:

    * the start page is the first page that contains one of the qualification
      keywords and a line starting with an appendix/attachment/schedule
      heading (optionally numbered "one"/"1") followed by a colon;
    * the end page is the first later page that either shows an
      appendix/attachment/schedule heading while containing none of the
      keywords, or shows a chapter/part heading.

    Args:
        pdf_path: Path of the PDF to read.
        output_folder: Directory passed on to ``save_extracted_pages``.
        output_suffix: Label of the section being extracted.
        common_header: Header text forwarded to ``clean_page_content`` and
            ``save_extracted_pages``.
        last_begin_index: Index of the first page to scan.

    Returns:
        The result of ``save_extracted_pages`` when both bounds were found;
        otherwise (unsupported suffix, bounds not found, or any exception)
        the empty string.
    """
    # Nothing to do for other sections, so do not pay for parsing the PDF.
    if output_suffix != "qualification":
        return ""

    try:
        pdf_document = PdfReader(pdf_path)
        print("最后一次尝试获取qualification!")

        # Keywords that mark a page as belonging to the qualification section.
        include_keys = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查", "资格检查", "能力", "信誉"]
        # Start heading: appendix / attachment / schedule, optionally numbered
        # "one"/"1", followed by a colon. Compiled once instead of per page.
        begin_pattern = regex.compile(
            r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::])',
            regex.MULTILINE
        )
        # End heading, variant 1: any appendix / attachment / schedule heading.
        end_pattern_attachment = regex.compile(
            r'^(?:附录[一二三四五六七八九1-9]*[::]|附件[一二三四五六七八九1-9]*[::]|附表[一二三四五六七八九1-9]*[::]).*$',
            regex.MULTILINE
        )
        # End heading, variant 2: a chapter / part title.
        end_pattern_chapter = regex.compile(
            r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',
            regex.MULTILINE
        )

        start_page = None
        end_page = None
        for i, page in enumerate(pdf_document.pages[last_begin_index:], start=last_begin_index):
            text = page.extract_text()
            if not text:
                continue
            cleaned_text = clean_page_content(text, common_header)
            has_include_key = any(key in cleaned_text for key in include_keys)

            if start_page is None:
                # Still looking for the start; the start page itself is never
                # considered as an end page.
                if has_include_key and begin_pattern.search(cleaned_text):
                    start_page = i
                continue

            # From here on i > start_page always holds.
            if end_pattern_attachment.search(cleaned_text):
                if has_include_key:
                    # Another attachment that still belongs to the section.
                    print(f"当前页面匹配附件结束模式但包含 include_keys,继续查找。页面: {i}")
                else:
                    end_page = i
                    break
            elif end_pattern_chapter.search(cleaned_text):
                end_page = i
                break

        if start_page is None or end_page is None:
            print(f"{output_suffix} twice: 未找到起始或结束页在文件 {pdf_path} 中!")
            return ""
        return save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)
    except Exception as e:
        # Best-effort fallback: report and signal failure with "".
        print(f"Error in extract_pages_twice: {e}")
        return ""
|
||
|
||
|
||
def truncate_pdf_main_engineering(input_path, output_folder, selection, logger, output_suffix="default", invalid_endpage=-1):
    """
    Extract one section from an engineering tender PDF, or from every PDF in a folder.

    Args:
        input_path: Path to a single PDF file or to a directory of PDFs.
            The ``.pdf`` extension is matched case-insensitively.
        output_folder: Directory handed to the extraction helpers.
        selection: Section to extract:
            1 - tender notice,
            2 - evaluation method,
            3 - qualification review,
            4 - instructions to bidders (two results),
            5 - invalid-bid content.
        logger: Object whose ``error`` method is called when processing a
            file raises.
        output_suffix: Accepted for backward compatibility. It is not read;
            the suffix actually used is derived from ``selection``.
        invalid_endpage: Forwarded unchanged to the extraction helpers.

    Returns:
        For a single PDF, a list: one entry for selections 1-3, two entries
        for selection 4, and ``[path, end_page]`` for selection 5. Failed
        extractions are represented by ``""``.
        For a directory, a list holding one such per-file list for each PDF,
        in sorted file-name order.
        For an invalid path or an unexpected error,
        ``[""] * (2 if selection == 4 else 1)``.
    """
    try:
        def process_single_file(pdf_path, output_folder, selection):
            """Extract the requested section from one PDF; always returns a list."""
            try:
                # Common page header and the index of the detected start page.
                common_header, last_begin_index = get_start_and_common_header(pdf_path, 20)
                # Per-selection default used only when no start page was detected.
                default_begin_pages = {
                    1: 0,  # tender notice
                    2: 5,  # evaluation method
                    3: 5,  # qualification review
                    4: 0,  # instructions to bidders
                    5: 0   # invalid bid
                }
                if last_begin_index != 0:
                    begin_page = last_begin_index
                else:
                    begin_page = default_begin_pages.get(selection, 0)

                if selection == 1:
                    # Tender notice
                    path = get_notice(pdf_path, output_folder, begin_page, common_header, invalid_endpage)
                    return [path or ""]
                elif selection == 2:
                    # Evaluation method: (begin, end) pattern pairs tried in order.
                    pattern_pairs = [
                        (
                            regex.compile(
                                r'^第[一二三四五六七八九十]+(?:章|部分)\s*'
                                r'(?<!"\s*)(?<!“\s*)(?<!”\s*)(?=.*(?:磋商(?=.*(?:办法|方法|内容))|'
                                r'谈判(?=.*(?:办法|方法|内容))|评标|评定|评审))',
                                regex.MULTILINE
                            ),
                            # Alternative end pattern
                            regex.compile(
                                r'^第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
                                r'[::]清标报告\s*$|'
                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文(?:部分)?\s*$|'
                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:评标|定标)详细程序\s*$',
                                regex.MULTILINE
                            )
                        ),
                        (
                            # The quote lookbehinds below cover the straight,
                            # left-curly and right-curly quote, matching every
                            # other pattern in this function (they used to
                            # repeat the straight quote three times).
                            regex.compile(
                                # First form: a chapter/part heading
                                r'(?:(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*'
                                r'(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:[\u4e00-\u9fff、()()]*?)'
                                r'(?=.*(?:(?:磋商|谈判)(?=.*(?:办法|方法|内容))|(?:评标|评定|评审)))'
                                r'[\u4e00-\u9fff、()()]*\s*$|'
                                # Second form: the evaluation-method front table title
                                r'^\s*(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)前附表\s*$)',
                                regex.MULTILINE
                            ),
                            regex.compile(
                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$',
                                regex.MULTILINE
                            )
                        )
                    ]
                    section_suffix = "evaluation_method"
                elif selection == 3:
                    # Qualification review conditions
                    pattern_pairs = [
                        (
                            regex.compile(
                                r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::]).*(?:资质|资格).*$|'
                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*'
                                r'[\u4e00-\u9fff、()()]*资格[\u4e00-\u9fff、()()]*\s*$',
                                regex.MULTILINE
                            ),
                            regex.compile(
                                r'^(?:附录[一二三四五六七八九1-9]*[::]|附件[一二三四五六七八九1-9]*[::]|附表[一二三四五六七八九1-9]*[::])(?!.*(?:资质|资格)).*|'
                                r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+',
                                regex.MULTILINE
                            )
                        )
                    ]
                    section_suffix = "qualification"
                elif selection == 4:
                    # Instructions to bidders: the helper yields two results.
                    path1, path2 = extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_header, invalid_endpage)
                    return [path1 or "", path2 or ""]
                elif selection == 5:
                    # Invalid bid (content before the bid-document format or contract terms)
                    invalid_path, end_page = get_invalid_file(pdf_path, output_folder, common_header, begin_page)
                    return [invalid_path or "", end_page]
                else:
                    print("无效的选择:请选择1-5")
                    return [""]

                # Selections 2 and 3: the first pattern pair that yields a result wins.
                for begin_pattern, end_pattern in pattern_pairs:
                    result = extract_pages(
                        pdf_path,
                        output_folder,
                        begin_pattern,
                        begin_page=begin_page,
                        invalid_endpage=invalid_endpage,
                        end_pattern=end_pattern,
                        output_suffix=section_suffix,
                        common_header=common_header
                    )
                    if result:
                        return [result]

                # Every pattern pair failed: try the fallback strategy.
                print(f"Selection {selection}: 所有模式对都未匹配成功。尝试执行 extract_pages_twice 或返回空。{pdf_path}")
                fallback_result = extract_pages_twice(
                    pdf_path,
                    output_folder,
                    section_suffix,
                    common_header,
                    last_begin_index
                )
                return [fallback_result or ""]

            except Exception as e:
                logger.error(f"Error in processing file '{pdf_path}': {e}")
                # NOTE(review): selection 5 normally returns two entries but
                # gets a single "" here — confirm callers tolerate that.
                return [""] * (2 if selection == 4 else 1)

        if os.path.isdir(input_path):
            generated_files = []
            # Sorted so the result order does not depend on the filesystem.
            for file_name in sorted(os.listdir(input_path)):
                if file_name.lower().endswith('.pdf'):
                    pdf_path = os.path.join(input_path, file_name)
                    # process_single_file always returns a list, so the
                    # directory result is a list of per-file lists.
                    generated_files.append(process_single_file(pdf_path, output_folder, selection))
            return generated_files
        elif os.path.isfile(input_path) and input_path.lower().endswith('.pdf'):
            return process_single_file(input_path, output_folder, selection)
        else:
            print("提供的路径既不是文件夹也不是PDF文件。")
            return [""] * (2 if selection == 4 else 1)
    except Exception as e:
        print(f"Error in truncate_pdf_main_engineering: {e}")
        return [""] * (2 if selection == 4 else 1)
|
||
|
||
|
||
if __name__ == "__main__":
    # Manual smoke run against a local folder of tender documents.
    run_logger = get_global_logger("123")
    started_at = time.time()

    # Either a folder of PDFs or a single PDF path may be supplied here.
    source_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹"
    target_folder = r"C:\Users\Administrator\Desktop\招标文件\output22"

    # 1 - tender notice, 2 - evaluation method, 3 - qualification review,
    # 4 - instructions to bidders (front table + body), 5 - invalid bid
    chosen_section = 2

    extracted = truncate_pdf_main_engineering(source_path, target_folder, chosen_section, run_logger)
    print(extracted)

    # Elapsed wall-clock time in seconds.
    finished_at = time.time()
    print(f"耗时:{finished_at - started_at}")
|