zbparse/flask_app/工程标/截取pdf工程标版.py
2024-12-18 10:32:24 +08:00

473 lines
26 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#flask_app/工程标/截取pdf工程标版.py
import regex
import os
import time
from PyPDF2 import PdfReader, PdfWriter
from flask_app.general.clean_pdf import clean_page_content
from flask_app.general.截取pdf通用函数 import get_start_and_common_header,save_extracted_pages
from flask_app.general.通用功能函数 import get_global_logger
def _build_tobidders_notice_patterns(chapter_type, is_secondary_match):
    """Build the patterns that locate the middle and the end of the bidder-notice section.

    Args:
        chapter_type: '章' or '部分' when the matched start heading contained that
            unit, otherwise None.
        is_secondary_match: True when the caller is using its second begin pattern;
            the end pattern is then anchored at the end of the page text instead of
            the start of a line.

    Returns:
        A tuple ``(end_pattern, mid_pattern)`` of compiled ``regex`` patterns.
    """
    if chapter_type:
        if not is_secondary_match:
            end_pattern = regex.compile(
                rf'^第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|'
                r'^评标(方法|办法)前附表|'
                r'^附录(?:一)?[:]|'
                r'^附件(?:一)?[:]|'
                r'^附表(?:一)?[:]',
                regex.MULTILINE
            )
        else:
            # The character class is the plain CJK range; a stray space inside it
            # would turn it into "space .. U+9FFF" and match almost any text.
            end_pattern = regex.compile(
                rf'第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+\s*$'
            )
        # A heading that uses the *other* unit also marks the start of the body:
        # documents numbered by "章" may nest "部分" headings and vice versa.
        other_unit = '部分' if chapter_type == '章' else '章'
        additional_mid_pattern = rf'^第[一二三四五六七八九十百千]+?(?:{other_unit})'
        base_mid_pattern = r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|' \
                           r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释)' \
                           r'|(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文\s*$'
        mid_pattern = regex.compile(
            rf'(?:{base_mid_pattern})|(?:{additional_mid_pattern})',
            regex.MULTILINE
        )
        return end_pattern, mid_pattern

    # No "章"/"部分" in the start heading: fall back to unit-agnostic defaults.
    if not is_secondary_match:
        end_pattern = regex.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|'
            r'^评标(方法|办法)前附表|'
            r'^附录(?:一)?[:]|'
            r'^附件(?:一)?[:]|'
            r'^附表(?:一)?[:]',
            regex.MULTILINE
        )
    else:
        end_pattern = regex.compile(
            r'第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$'
        )
    base_mid_pattern = r'^\s*(?:[(]\s*[一1]?\s*[)]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)'
    mid_pattern = regex.compile(base_mid_pattern, regex.MULTILINE)
    return end_pattern, mid_pattern


def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header,
                                   is_secondary_match):
    """Split the bidder-notice section of a PDF into two page ranges and save both.

    Pages are scanned in order for three landmarks: the first page after
    ``begin_page`` that matches ``begin_pattern`` (start), the first later page
    that matches the "middle" pattern (mid), and the first page after mid that
    matches the end pattern (end). The ranges start..mid and mid..end are handed
    to ``save_extracted_pages`` with the suffixes ``tobidders_notice_table`` and
    ``tobidders_notice``.

    Args:
        pdf_path: Path of the PDF to read.
        output_folder: Folder passed through to ``save_extracted_pages``.
        begin_pattern: Pattern (string or compiled) that identifies the start page.
        begin_page: The start page must have an index strictly greater than this.
        common_header: Passed to ``clean_page_content`` together with each page's
            text (presumably a repeated page header to strip - confirm in that helper).
        is_secondary_match: Selects the end-of-page anchored variant of the end pattern.

    Returns:
        ``(path1, path2)`` as returned by ``save_extracted_pages``, or ``("", "")``
        when any of the three landmarks is missing.
    """
    pdf_document = PdfReader(pdf_path)
    exclusion_pattern = regex.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')

    start_page = None
    mid_page = None
    end_page = None
    mid_pattern = None
    end_pattern = None

    for i, page in enumerate(pdf_document.pages):
        text = page.extract_text() or ""
        cleaned_text = clean_page_content(text, common_header)

        # Once the middle is known, pages containing the exclusion phrases are
        # ignored so they cannot be mistaken for the end of the section.
        if mid_page is not None and regex.search(exclusion_pattern, cleaned_text):
            continue

        if start_page is None:
            match = regex.search(begin_pattern, cleaned_text)
            if match and i > begin_page:
                start_page = i
                matched_text = match.group(0)
                # Remember which unit the heading used so the end pattern can
                # require the same one. (An empty-string membership test is always
                # True and would leave chapter_type permanently falsy.)
                if '章' in matched_text:
                    chapter_type = '章'
                elif '部分' in matched_text:
                    chapter_type = '部分'
                else:
                    chapter_type = None
                end_pattern, mid_pattern = _build_tobidders_notice_patterns(chapter_type, is_secondary_match)
            # Nothing else can match on or before the start page.
            continue

        if mid_page is None and regex.search(mid_pattern, cleaned_text):
            mid_page = i

        # The end pattern exists in both the chapter-specific and the default
        # case, so it is consulted regardless of chapter_type.
        if mid_page is not None and i > mid_page and regex.search(end_pattern, cleaned_text):
            end_page = i
            break

    if start_page is None or end_page is None or mid_page is None:
        print(f"first: tobidders_notice 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
        return "", ""

    path1 = save_extracted_pages(pdf_path, output_folder, start_page, mid_page, "tobidders_notice_table", common_header)
    path2 = save_extracted_pages(pdf_path, output_folder, mid_page, end_page, "tobidders_notice", common_header)
    return path1, path2
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix, common_header):
    """Find one start/end page pair in a PDF and save that page range.

    Args:
        pdf_path: Path of the PDF to read.
        output_folder: Folder passed through to ``save_extracted_pages``.
        begin_pattern: Pattern identifying a start page; only pages with index
            ``>= begin_page`` are accepted.
        begin_page: Lowest page index allowed as the start page.
        end_pattern: Pattern identifying the end page.
        output_suffix: Section label. For ``"notice"`` and ``"invalid"`` the first
            start match is kept; for every other suffix a later start match
            replaces an earlier one. For ``"invalid"`` the end page must also have
            an index greater than 30.
        common_header: Passed to ``clean_page_content`` together with each page's text.

    Returns:
        A one-element list holding the value returned by ``save_extracted_pages``,
        or ``[""]`` when no start or end page was found.
    """
    pdf_document = PdfReader(pdf_path)
    exclusion_pattern = regex.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成')
    keep_first_start = output_suffix in ("notice", "invalid")
    start_page = None
    end_page = None

    for i, page in enumerate(pdf_document.pages):
        text = page.extract_text()
        if not text:
            continue
        cleaned_text = clean_page_content(text, common_header)
        if regex.search(exclusion_pattern, cleaned_text):
            continue

        begin_hit = regex.search(begin_pattern, cleaned_text)
        if begin_hit and i >= begin_page:
            # Compare against None, not truthiness: page index 0 is a valid start
            # and must not be overwritten by a later match for these suffixes.
            if not (keep_first_start and start_page is not None):
                start_page = i

        # A page that matches the begin pattern is never accepted as the end page.
        if start_page is None or begin_hit or i <= start_page:
            continue
        if not regex.search(end_pattern, cleaned_text):
            continue
        # The "invalid" section is assumed to span at least 30 pages.
        if output_suffix != "invalid" or i > 30:
            end_page = i
            break

    if start_page is None or end_page is None:
        print(f"{output_suffix} first: 未找到起始或结束页在文件 {pdf_path} 中!")
        return [""]
    return [save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)]
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, last_begin_index):
    """
    Fallback extraction used after every begin/end pattern pair has failed.

    Only two suffixes are handled:

    * ``"qualification"``: scan from ``last_begin_index`` for an appendix/attachment
      heading on a page that also contains one of the qualification keywords, then
      stop at the next appendix heading without those keywords or at the next
      chapter-level heading.
    * ``"invalid"``: no searching; the range runs from ``last_begin_index`` to the
      smaller of 90 and two thirds of the page count.

    Args:
        pdf_path: Path of the PDF to read.
        output_folder: Folder passed through to ``save_extracted_pages``.
        output_suffix: Section label selecting the strategy above.
        common_header: Passed to ``clean_page_content`` together with each page's text.
        last_begin_index: Page index where scanning (or the "invalid" range) starts.

    Returns:
        A one-element list holding the value returned by ``save_extracted_pages``,
        or ``[]`` when nothing was found, the suffix is unsupported, or any
        exception was raised (the exception is printed, not propagated).
    """
    try:
        # Opened once and shared by both strategies.
        pdf_document = PdfReader(pdf_path)

        if output_suffix == "qualification":
            print("twice:qualificaiton!")
            include_keys = ["资格审查", "资质审查", "符合性审查", "资格性检查", "符合性检查","资格检查","能力","信誉"]
            # Start: only an appendix / attachment / schedule heading qualifies.
            begin_pattern = regex.compile(
                r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:])',
                regex.MULTILINE
            )
            # End candidate 1: any appendix / attachment / schedule heading.
            end_pattern_attachment = regex.compile(
                r'^(?:附录[一二三四五六七八九1-9]*[:]|附件[一二三四五六七八九1-9]*[:]|附表[一二三四五六七八九1-9]*[:]).*$',
                regex.MULTILINE
            )
            # End candidate 2: chapter heading, evaluation table, or bidder notice.
            end_pattern_chapter = regex.compile(
                r'第[一二三四五六七八九]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$|'
                r'\s*评标(?:办法|方法)前附表\s*$|'
                r'投标人须知',
                regex.MULTILINE
            )
            start_page = None
            end_page = None

            for i, page in enumerate(pdf_document.pages[last_begin_index:], start=last_begin_index):
                text = page.extract_text()
                if not text:
                    continue
                cleaned_text = clean_page_content(text, common_header)
                has_key = any(key in cleaned_text for key in include_keys)

                if start_page is None:
                    if has_key and begin_pattern.search(cleaned_text):
                        start_page = i
                    continue

                if end_pattern_attachment.search(cleaned_text):
                    if has_key:
                        # Still inside the qualification material: keep scanning.
                        print(f"当前页面匹配附件结束模式但包含 include_keys继续查找。页面: {i}")
                    elif i > start_page:
                        end_page = i
                        break
                elif end_pattern_chapter.search(cleaned_text):
                    if i > start_page:
                        end_page = i
                        break

            if start_page is None or end_page is None:
                print(f"{output_suffix} twice: 未找到起始或结束页在文件 {pdf_path} 中!")
                return []
            return [save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)]

        if output_suffix == "invalid":
            total_pages = len(pdf_document.pages)
            # Cap the range at two thirds of the document, and at page 90.
            total = int(total_pages * 2 / 3)
            start_page = last_begin_index
            end_page = min(90, total)
            return [save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)]

        print(f"{output_suffix} twice: 未定义的输出后缀。")
        return []
    except Exception as e:
        print(f"Error in extract_pages_twice: {e}")
        return []
def _engineering_pattern_pairs(selection):
    """Return ``(pattern_pairs, output_suffix)`` for a selection, or None if unknown.

    ``pattern_pairs`` is a list of ``(begin_pattern, end_pattern)`` tuples tried in
    order; the second pair of each selection is the looser fallback.
    """
    if selection == 1:
        # Selection 1: tender announcement
        pattern_pairs = [
            (
                regex.compile(
                    r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'
                ),
                regex.compile(
                    r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE
                )
            ),
            (
                regex.compile(
                    r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
                    regex.MULTILINE
                ),
                regex.compile(
                    r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知(?:前附表)?\s*$',
                    regex.MULTILINE
                )
            )
        ]
        return pattern_pairs, "notice"

    if selection == 2:
        # Selection 2: evaluation method
        pattern_pairs = [
            (
                regex.compile(
                    r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))',
                    regex.MULTILINE
                ),
                regex.compile(
                    r'^第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+|[:]清标报告\s*$|'
                    r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)正文\s*$|'
                    r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:评标|定标)详细程序\s*$',
                    regex.MULTILINE
                )
            ),
            (
                regex.compile(
                    r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
                    r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*'
                    r'[\u4e00-\u9fff、()]*?'
                    r'(?=.*(?:磋商|谈判|评标|评定|评审))'
                    r'(?=.*(?:办法|方法))'
                    r'[\u4e00-\u9fff、()]*\s*$|'
                    r'\s*评标(办法|方法)前附表\s*$',
                    regex.MULTILINE
                ),
                regex.compile(
                    r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
                    regex.MULTILINE
                )
            )
        ]
        return pattern_pairs, "evaluation_method"

    if selection == 3:
        # Selection 3: qualification review conditions
        pattern_pairs = [
            (
                regex.compile(
                    r'^(?:附录(?:[一1])?[:]|附件(?:[一1])?[:]|附表(?:[一1])?[:]).*(?:资质|能力|信誉).*$|'
                    r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*'
                    r'[\u4e00-\u9fff、()]*资格[\u4e00-\u9fff、()]*\s*$',
                    regex.MULTILINE
                ),
                regex.compile(
                    r'^(?:附录[一二三四五六七八九1-9]*[:]|附件[一二三四五六七八九1-9]*[:]|附表[一二三四五六七八九1-9]*[:])(?!.*(?:资质|能力|信誉)).*|'
                    r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+',
                    regex.MULTILINE
                )
            )
        ]
        return pattern_pairs, "qualification"

    if selection == 4:
        # Selection 4: instructions to bidders (data sheet + body)
        pattern_pairs = [
            (
                regex.compile(
                    r'(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知|'
                    r'(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表)\s*$',
                    regex.MULTILINE
                ),
                regex.compile(
                    r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]',
                    regex.MULTILINE
                )
            ),
            (
                regex.compile(
                    r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|'
                    r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表\s*$',
                    regex.MULTILINE
                ),
                regex.compile(
                    r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|^附录(?:一)?[:]|^附件(?:一)?[:]|^附表(?:一)?[:]',
                    regex.MULTILINE
                )
            )
        ]
        return pattern_pairs, "tobidders_notice"

    if selection == 5:
        # Selection 5: invalid-bid material
        pattern_pairs = [
            (
                regex.compile(
                    r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'
                ),
                regex.compile(
                    r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|^第二卷',
                    regex.MULTILINE
                )
            ),
            (
                regex.compile(
                    r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
                    regex.MULTILINE
                ),
                regex.compile(
                    r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|^第二卷',
                    regex.MULTILINE
                )
            )
        ]
        return pattern_pairs, "invalid"

    return None


def truncate_pdf_main_engineering(input_path, output_folder, selection,logger,output_suffix="default"):
    """Extract one logical section from an engineering tender PDF (or a folder of PDFs).

    Args:
        input_path: A ``.pdf`` file, or a directory whose ``.pdf`` files are all processed.
        output_folder: Folder passed through to the page-saving helpers.
        selection: 1 announcement, 2 evaluation method, 3 qualification review,
            4 instructions to bidders (two outputs), 5 invalid-bid material.
        logger: Logger used to report failures (``logger.error``).
        output_suffix: Accepted for interface compatibility but not read; the
            suffix is derived from ``selection``.

    Returns:
        A list of output paths. On failure the list contains empty strings: two for
        selection 4, otherwise one. For a directory the per-file lists are concatenated.
    """
    def empty_result():
        # Selection 4 produces two files, so its failure value has two slots.
        return [""] * (2 if selection == 4 else 1)

    def process_single_file(file_path, output_folder, selection):
        try:
            # Shared page header and the page index where the real content starts.
            common_header, last_begin_index = get_start_and_common_header(file_path, 20)

            spec = _engineering_pattern_pairs(selection)
            if spec is None:
                print("无效的选择:请选择1-5")
                return [""]
            pattern_pairs, suffix = spec

            # When no start index was detected (0), fall back to a per-selection
            # minimum page; the value does not depend on the pattern pair.
            if last_begin_index != 0:
                begin_page = last_begin_index
            else:
                begin_page = {
                    1: 0,   # announcement
                    2: 10,  # evaluation method
                    3: 5,   # qualification
                    4: 3,   # bidder-notice data sheet
                    5: 0    # invalid-bid material
                }.get(selection, 0)

            for idx, (begin_pattern, end_pattern) in enumerate(pattern_pairs, start=1):
                is_secondary_match = (idx == 2)
                if selection == 4:
                    # The bidder notice is split in two and builds its own end pattern.
                    output_paths = list(
                        extract_pages_tobidders_notice(
                            file_path,
                            output_folder,
                            begin_pattern,
                            begin_page,
                            common_header,
                            is_secondary_match
                        )
                    )
                else:
                    output_paths = extract_pages(
                        file_path,
                        output_folder,
                        begin_pattern,
                        begin_page=begin_page,
                        end_pattern=end_pattern,
                        output_suffix=suffix,
                        common_header=common_header
                    )
                # Success means at least one returned path is an existing file.
                if output_paths and any(os.path.isfile(f) for f in output_paths):
                    return output_paths

            # Every pattern pair failed: try the coarse fallback.
            print(f"Selection {selection}: 所有模式对都未匹配成功。尝试执行 extract_pages_twice 或返回空。")
            fallback_result = extract_pages_twice(
                file_path,
                output_folder,
                suffix,
                common_header,
                last_begin_index
            )
            if fallback_result and any(os.path.isfile(f) for f in fallback_result):
                return fallback_result

            logger.error(f"Selection {selection}: 回退提取失败,返回空字符串。")
            return ["", ""] if selection == 4 else [""]
        except Exception as e:
            logger.error(f"Error in processing file '{file_path}': {e}")
            return empty_result()

    try:
        if os.path.isdir(input_path):
            generated_files = []
            for file_name in os.listdir(input_path):
                if file_name.endswith('.pdf'):
                    file_path = os.path.join(input_path, file_name)
                    files = process_single_file(file_path, output_folder, selection)
                    if files:
                        generated_files.extend(files)
            return generated_files
        if os.path.isfile(input_path) and input_path.endswith('.pdf'):
            return process_single_file(input_path, output_folder, selection)
        print("提供的路径既不是文件夹也不是PDF文件。")
        return empty_result()
    except Exception as e:
        # Report under this function's own name so the failure can be traced.
        print(f"Error in truncate_pdf_main_engineering: {e}")
        return empty_result()
if __name__ == "__main__":
    # Manual run harness: sets up a logger and sample paths, then reports the
    # elapsed wall-clock time. The extraction call itself is left commented out.
    logger = get_global_logger("123")
    start_time = time.time()

    # Alternative inputs:
    # input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
    # pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
    # input_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
    pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest4_evaluation_method.pdf"
    output_folder = r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp"

    # selection: 1 announcement, 2 evaluation method, 3 qualification review,
    #            4 bidder-notice data sheet + body, 5 invalid-bid material
    # selection = 1
    # generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection, logger)
    # print(generated_files)
    # print("生成的文件:", generated_files)

    elapsed = time.time() - start_time
    print(f"耗时:{elapsed}")