zbparse/flask_app/货物标/截取pdf货物标版.py
2025-02-07 17:08:33 +08:00

330 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import fitz
from PyPDF2 import PdfReader
import regex # 导入正则表达式库
import os # 用于文件和文件夹操作
from flask_app.general.读取文件.clean_pdf import clean_page_content
from flask_app.general.merge_pdfs import merge_and_cleanup
from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, is_pdf_or_doc, \
get_invalid_file, extract_pages_tobidders_notice, extract_pages_generic, get_notice, create_get_text_function
from flask_app.general.通用功能函数 import get_global_logger
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, common_header,
                  output_suffix, logger, invalid_endpage=-1):
    """First-pass extraction of one section of a tender PDF.

    The page range is looked up with ``extract_pages_generic`` using the given
    begin/end patterns. If either bound is missing, the work is handed to
    ``extract_pages_twice`` (the fallback strategy). On success the range is
    written out through ``save_extracted_pages``.

    Args:
        pdf_path: Path of the source PDF.
        output_folder: Folder that receives the extracted file.
        begin_pattern: Compiled pattern marking the start of the section.
        begin_page: Page index from which the search starts.
        end_pattern: Compiled pattern marking the end of the section.
        common_header: Header text forwarded to the helper functions.
        output_suffix: Section tag (e.g. ``"procurement"``, ``"qualification1"``).
        logger: Logger forwarded to the fallback / nested extraction calls.
        invalid_endpage: Forwarded unchanged to ``extract_pages_generic``.

    Returns:
        Whatever ``save_extracted_pages`` / ``extract_pages_twice`` returns
        (presumably the output file path), or ``""`` if any exception occurs.
    """
    try:
        # Lines ending with these "composition of the document" phrases are
        # handed to extract_pages_generic as its exclusion pattern.
        toc_exclusion = regex.compile(
            r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$',
            regex.MULTILINE,
        )
        start_page, end_page = extract_pages_generic(
            pdf_path, begin_pattern, end_pattern, begin_page,
            common_header, toc_exclusion, output_suffix, invalid_endpage,
        )

        range_found = start_page is not None and end_page is not None
        if not range_found:
            print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
            return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page, logger)

        if output_suffix == "qualification1":
            # Also cut the evaluation-method chapter (tagged "qualification3")
            # so it can be merged with the qualification-review chapter.
            truncate_pdf_main_goods(pdf_path, output_folder, 2, logger, "qualification3")

        return save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return ""
def get_patterns_for_procurement():
    """Build the fallback begin/end patterns for the procurement-requirements chapter.

    Both patterns rely on the third-party ``regex`` module because they use
    variable-width negative lookbehinds (e.g. ``(?<!根据\\s*)``) to reject chapter
    titles that are merely being referenced ("according to 第X章 ...") or quoted.

    Returns:
        tuple: ``(begin_pattern, end_pattern)``, both compiled with
        ``regex.MULTILINE``.
    """
    begin_pattern = regex.compile(
        r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
        r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*(?!.*说明)'  # "第X章" / "第X部分", not followed by "说明"
        r'[\u4e00-\u9fff、()]*?'  # allowed title characters
        r'(?:(?:服务|项目|商务|技术|供货)[\u4e00-\u9fff、()]*?要求[\u4e00-\u9fff、()]*?\s*$|'  # 服务/项目/商务/技术/供货 ... 要求
        r'(?:采购.*?(?:内容|要求|需求)|招标(?:内容|要求|需求))[\u4e00-\u9fff、()]*?|'
        r'需求书[\u4e00-\u9fff、()]*?)\s*$',
        regex.MULTILINE
    )
    # The numeral class now includes "十", matching the begin pattern above;
    # without it a following "第十章 ..." heading could never terminate the section.
    end_pattern = regex.compile(
        r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
        r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
        regex.MULTILINE
    )
    return begin_pattern, end_pattern
# """
# 第四章
# 评标
# """ 也能匹配上
def get_patterns_for_evaluation_method():
    """Build the fallback begin/end patterns for the evaluation-method chapter.

    The begin pattern has two alternatives:

    1. A chapter heading ("第X章" / "第X部分") whose title contains 评标/评定/评审,
       or 磋商/谈判 followed later on the line by 办法/方法/内容.
    2. A stand-alone line "评标方法" / "评标办法", optionally followed by "前附表".

    Negative lookbehinds reject headings that are only referenced or quoted.
    They need the third-party ``regex`` module because they are variable-width.

    Returns:
        tuple: ``(begin_pattern, end_pattern)``, both compiled with
        ``regex.MULTILINE``.
    """
    # The last quote lookbehind in each alternative is the RIGHT curly quote.
    # Previously the left quote was listed twice, so a heading preceded by a
    # closing quote was not rejected.
    begin_pattern = regex.compile(
        r'(?:(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*'  # alternative 1
        r'(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:[\u4e00-\u9fff、()]*?)'
        r'(?=.*(?:(?:磋商|谈判)(?=.*(?:办法|方法|内容))|(?:评标|评定|评审)))'
        r'[\u4e00-\u9fff、()]*\s*$|'
        r'^\s*(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)(?:前附表)?\s*$)',  # alternative 2
        regex.MULTILINE
    )
    end_pattern = regex.compile(
        r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
        regex.MULTILINE
    )
    return begin_pattern, end_pattern
def extract_pages_qualification(pdf_path, begin_page, common_header):
    """Second-attempt search for the qualification-review pages.

    Pages are scanned from ``begin_page``. The start page is the first page that
    either has a stand-alone review-table heading (``priority_pattern``) or has
    an appendix-style heading (附录/附件/附表) together with one of the include
    keywords and none of the exclude keywords. The end page is the first later
    page with another appendix heading that no longer mentions a review
    keyword, or with a chapter heading.

    The PDF is opened with PyPDF2 first and with PyMuPDF (``fitz``) as a
    fallback. The reader is created only inside the ``try`` so that a PyPDF2
    failure actually reaches the fallback.

    Args:
        pdf_path: Path of the PDF to scan.
        begin_page: Index of the first page to inspect.
        common_header: Header text passed to ``clean_page_content``.

    Returns:
        tuple: ``(start_page, end_page)``; either element is ``None`` when not
        found. ``(None, None)`` is returned when both readers fail.
    """
    # Start pattern: a line beginning with 附录 / 附件 / 附表 (optionally numbered 一/1).
    begin_pattern = regex.compile(
        r'^(?:附录(?:一|1)?[:]?|附件(?:一|1)?[:]?|附表(?:一|1)?[:]?)(?!的)',
        regex.MULTILINE
    )
    # Priority pattern: a whole line such as "资格性检查", "资格审查表", "符合性审查".
    priority_pattern = regex.compile(
        r'^((资格|符合|资格性|符合性)(、(资格|符合|资格性|符合性))*)(检查|审查)(?:表|比对表|对照表)?\s*$',
        regex.MULTILINE
    )
    # End pattern, variant 1: the next appendix / attachment heading.
    end_pattern_attachment = regex.compile(
        r'^(?:附录(?!的).*?|附件(?!的).*?|附表(?!的).*?|附件\s*\d+(?!的)).*',
        regex.MULTILINE
    )
    # End pattern, variant 2: a chapter heading.
    end_pattern_chapter = regex.compile(
        r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
        regex.MULTILINE
    )
    print("第二次尝试 qualification:匹配附件")
    start_page = None
    end_page = None
    include_keywords = ["资格审查", "资质审查", "符合性审查", "资格性审查", "符合性检查", "资格性检查", "资格检查"]
    exclude_keywords = ["声明函", "承诺函"]

    try:
        # Preferred reader: PyPDF2.
        pdf_document = PdfReader(pdf_path)
        total_pages = len(pdf_document.pages)
        get_text = create_get_text_function('pypdf2', pdf_document)
    except Exception as e_pypdf2:
        print(f"extract_pages_qualification:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
        try:
            # Fallback reader: PyMuPDF.
            pdf_document = fitz.open(pdf_path)
            total_pages = pdf_document.page_count
            get_text = create_get_text_function('fitz', pdf_document)
        except Exception as e_fitz:
            print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
            return None, None

    for i in range(begin_page, total_pages):
        text = get_text(i)
        if not text:
            continue
        cleaned_text = clean_page_content(text, common_header)

        if start_page is None:
            # Still looking for the start page; end checks cannot apply yet.
            if priority_pattern.search(cleaned_text):
                start_page = i
                print(f"匹配到priority_pattern设置起始页为: {start_page}")
            elif (
                any(keyword in cleaned_text for keyword in include_keywords)
                and all(keyword not in cleaned_text for keyword in exclude_keywords)
                and begin_pattern.search(cleaned_text)
            ):
                start_page = i
                print(f"匹配到附录等模式begin_pattern设置起始页为: {start_page}")
            continue

        # Start page known: look for the end page.
        if end_pattern_attachment.search(cleaned_text):
            # An attachment heading only ends the section if the page no longer
            # talks about qualification / conformity review.
            if not any(keyword in cleaned_text for keyword in include_keywords):
                if i > start_page:
                    end_page = i
                    print(f"qualification找到结束页 (附件类): {end_page}")
                    break
            else:
                print(f"当前页面匹配附件结束模式但包含 include_keywords继续查找。页面: {i}")
        elif end_pattern_chapter.search(cleaned_text):
            if i > start_page:
                end_page = i
                print(f"qualification找到结束页 (章节标题): {end_page}")
                break

    return start_page, end_page
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page, logger):
    """Fallback extraction used when the first-pass patterns found no page range.

    Strategy by ``output_suffix``:

    * ``"procurement"``: retry with ``get_patterns_for_procurement``.
    * ``"evaluation_method"`` / ``"qualification2"`` / ``"qualification3"``:
      retry with ``get_patterns_for_evaluation_method``.
    * ``"qualification1"``: use ``extract_pages_qualification``. If that also
      fails, reuse an existing ``*_evaluation_method.pdf`` in ``output_folder``
      (copied to ``*_qualification2.pdf``) or cut the evaluation chapter via
      ``truncate_pdf_main_goods``.

    Returns:
        The result of ``save_extracted_pages`` (presumably the output path), the
        path of the copied/cut qualification file, or ``""`` on failure.
    """
    try:
        toc_exclusion = regex.compile(
            r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
        pattern_pairs = None
        start_page = None
        end_page = None

        if output_suffix == "procurement":
            pattern_pairs = [get_patterns_for_procurement()]
        elif output_suffix in ("evaluation_method", "qualification2", "qualification3"):
            pattern_pairs = [get_patterns_for_evaluation_method()]
        elif output_suffix == "qualification1":
            start_page, end_page = extract_pages_qualification(pdf_path, begin_page, common_header)

        if pattern_pairs:
            for begin_pat, end_pat in pattern_pairs:
                start_page, end_page = extract_pages_generic(
                    pdf_path, begin_pat, end_pat, begin_page,
                    common_header, toc_exclusion, output_suffix)
                if start_page is not None and end_page is not None:
                    break

        if start_page is not None and end_page is not None:
            return save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)

        # No usable page range from here on.
        if output_suffix != "qualification1":
            print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
            return ""

        # Third attempt for the qualification review: fall back to the
        # evaluation-method chapter.
        print("第三次尝试资格审查:尝试提取评分办法章节...")
        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        evaluation_method_file = os.path.join(output_folder, f"{base_file_name}_evaluation_method.pdf")

        if not os.path.isfile(evaluation_method_file):
            cut_files = truncate_pdf_main_goods(pdf_path, output_folder, 2, logger, "qualification2")
            return cut_files[0] if len(cut_files) > 0 else ""

        print(f"找到评分办法章节文件: {evaluation_method_file},生成新文件。")
        new_file_path = os.path.join(output_folder, f"{base_file_name}_qualification2.pdf")
        # Plain byte-for-byte copy of the already extracted chapter.
        with open(evaluation_method_file, 'rb') as original_file, open(new_file_path, 'wb') as new_file:
            new_file.write(original_file.read())
        return new_file_path
    except Exception as e:
        print(f"Error in extract_pages_twice: {e}")
        return ""
def truncate_pdf_main_goods(input_path, output_folder, selection, logger, output_suffix="default", invalid_endpage=-1):
    """Cut one logical section out of a goods-tender PDF (or every PDF in a folder).

    ``selection`` chooses the section:

    * 1: tender notice (delegated to ``get_notice``)
    * 2: evaluation method (suffix ``evaluation_method``)
    * 3: qualification review (suffix ``qualification1``)
    * 4: instructions to bidders, two parts (``extract_pages_tobidders_notice``)
    * 5: procurement requirements (suffix ``procurement``)
    * 6: the "invalid" leading part (``get_invalid_file``)

    Args:
        input_path: A PDF (or doc) file, or a directory containing ``.pdf`` files.
        output_folder: Destination folder; created if it does not exist.
        selection: Section selector, 1-6 as listed above.
        logger: Object with an ``error`` method, used for error reporting.
        output_suffix: Overrides the suffix derived from ``selection`` unless
            left at ``"default"``.
        invalid_endpage: Forwarded unchanged to the extraction helpers.

    Returns:
        list: For a single file, the list produced for that file (two entries
        for selection 4, ``[path, end_page]`` for selection 6, otherwise one
        entry). For a directory, the per-file lists concatenated into one flat
        list. On failure the entries are empty strings.
    """
    try:
        def process_single_file(pdf_path, output_folder, selection, output_suffix):
            """Run the selected extraction on one PDF and return its result list."""
            try:
                # exist_ok avoids the check-then-create race of an explicit exists() test.
                os.makedirs(output_folder, exist_ok=True)

                common_header, last_begin_index = get_start_and_common_header(pdf_path, 20)
                # When no start index was detected (0), fall back to a
                # per-selection default first page.
                begin_page = last_begin_index if last_begin_index != 0 else {
                    4: 0,
                    2: 5,
                    3: 5,
                    1: 0,
                    5: 3,
                    6: 0
                }.get(selection, 0)

                # Selections 2, 3 and 5 share this end marker: a chapter heading
                # at the start of a line.
                chapter_heading = r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'

                if selection == 1:  # tender notice
                    path = get_notice(pdf_path, output_folder, begin_page, common_header, invalid_endpage)
                    return [path or ""]
                elif selection == 2:  # evaluation method
                    begin_pattern = regex.compile(
                        r'^第[一二三四五六七八九十]+(?:章|部分)\s*'
                        r'(?<!"\s*)(?<!“\s*)(?<!”\s*)(?=.*(?:磋商(?=.*(?:办法|方法|内容))|'
                        r'谈判(?=.*(?:办法|方法|内容))|评标|评定|评审))',
                        regex.MULTILINE
                    )
                    end_pattern = regex.compile(chapter_heading, regex.MULTILINE)
                    local_output_suffix = "evaluation_method"
                elif selection == 3:  # qualification review
                    begin_pattern = regex.compile(
                        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', regex.MULTILINE
                    )
                    end_pattern = regex.compile(chapter_heading, regex.MULTILINE)
                    local_output_suffix = "qualification1"
                elif selection == 4:  # instructions to bidders: data sheet + main text
                    path1, path2 = extract_pages_tobidders_notice(
                        pdf_path, output_folder, begin_page, common_header, invalid_endpage)
                    return [path1 or "", path2 or ""]
                elif selection == 5:  # procurement requirements
                    # NOTE(review): unlike the other begin patterns this one is
                    # compiled without regex.MULTILINE, so "^" only anchors at the
                    # start of the page text. Left unchanged; confirm whether
                    # that is intentional.
                    begin_pattern = regex.compile(
                        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术|供货).*?要求|'
                        r'^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购.*?(?:内容|要求|需求)|(招标|项目)(?:内容|要求|需求)).*|'
                        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?需求书'
                    )
                    end_pattern = regex.compile(chapter_heading, regex.MULTILINE)
                    local_output_suffix = "procurement"
                elif selection == 6:  # content before the bid-format / contract-terms part
                    invalid_path, end_page = get_invalid_file(pdf_path, output_folder, common_header, begin_page)
                    return [invalid_path or "", end_page]
                else:
                    print("无效的选择:请选择1-6")
                    return ['']

                # A caller-supplied suffix wins; "default" means "derive from selection".
                if output_suffix == "default":
                    output_suffix = local_output_suffix

                result = extract_pages(
                    pdf_path,
                    output_folder,
                    begin_pattern,
                    begin_page,
                    end_pattern,
                    common_header,
                    output_suffix,
                    logger,
                    invalid_endpage
                )
                if not result:
                    return [""]
                if output_suffix == "qualification1":
                    # Combine with the "qualification3" file produced during
                    # extract_pages (the evaluation-method chapter).
                    merge_and_cleanup(result, "qualification3")
                return [result]
            except Exception as e:
                logger.error(f"Error in processing file '{pdf_path}': {e}")
                return [""] * (2 if selection == 4 else 1)

        if os.path.isdir(input_path):
            generated_files = []
            for file_name in os.listdir(input_path):
                if file_name.endswith('.pdf'):
                    pdf_path = os.path.join(input_path, file_name)
                    result = process_single_file(pdf_path, output_folder, selection, output_suffix)
                    # process_single_file always returns a list; flatten it so the
                    # directory result has the same shape as the single-file one.
                    # (The former tuple check never matched and produced nested lists.)
                    generated_files.extend(result)
            return generated_files
        elif os.path.isfile(input_path) and is_pdf_or_doc(input_path):
            return process_single_file(input_path, output_folder, selection, output_suffix)
        else:
            logger.error("提供的路径既不是文件夹也不是PDF文件。")
            return [""] * (2 if selection == 4 else 1)
    except Exception as e:
        logger.error(f"Error in truncate_pdf_main_goods: {e}")
        return [""] * (2 if selection == 4 else 1)
# TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 工程标中,判断是符合性审查之后,可以将它们设为同一章
#TODO: 验证所有文件的切分逻辑。
# ztbfile.pdf少资格评审 包头少符合性评审
if __name__ == "__main__":
    # Manual smoke run against a local sample file.
    logger = get_global_logger("123")

    # Alternative sample inputs used during development:
    # input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
    # pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
    # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
    # pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
    # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
    pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\招标文件(107国道).pdf"
    output_folder = r"C:\Users\Administrator\Desktop\货物标\zbfiles\output6"

    # selection: 1 = notice, 2 = evaluation_method,
    # 3 = qualification review (suffix qualification1 or qualification2, the
    #     latter being the same as the evaluation method),
    # 4 = instructions to bidders (part1 data sheet, part2 main text),
    # 5 = procurement requirements, 6 = invalid
    selection = 6

    generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection, logger)
    print(generated_files)