zbparse/flask_app/货物标/截取pdf货物标版.py

322 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import fitz
from PyPDF2 import PdfReader
import regex # 导入正则表达式库
import os # 用于文件和文件夹操作
from flask_app.general.读取文件.clean_pdf import clean_page_content,create_get_text_function
from flask_app.general.merge_pdfs import merge_and_cleanup
from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, is_pdf_or_doc, \
get_invalid_file, extract_pages_tobidders_notice, extract_pages_generic, get_notice
from flask_app.general.通用功能函数 import get_global_logger
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, common_header,
                  output_suffix, logger, invalid_endpage=-1):
    """Run the first-pass extraction of one section of a tender PDF.

    The page range is located with ``extract_pages_generic`` and written out
    with ``save_extracted_pages``. If either boundary is missing, the work is
    delegated to ``extract_pages_twice`` (the fallback strategy).

    Args:
        pdf_path: Path of the PDF to cut.
        output_folder: Directory that receives the extracted file.
        begin_pattern: Compiled pattern marking the first page of the section.
        begin_page: Page index from which the search starts.
        end_pattern: Compiled pattern marking the page that ends the section.
        common_header: Header text forwarded to the helper functions.
        output_suffix: Tag identifying the section (e.g. ``"qualification1"``).
        logger: Logger forwarded to the fallback / nested extraction calls.
        invalid_endpage: Forwarded unchanged to ``extract_pages_generic``.

    Returns:
        The value returned by ``save_extracted_pages`` or
        ``extract_pages_twice`` (presumably the output file path -- confirm
        against those helpers), or ``""`` when any exception is raised.
    """
    try:
        # Lines ending with "composition / preparation of the documents"
        # phrases; handed to the generic extractor as its exclusion pattern.
        skip_heading = regex.compile(
            r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
        first_page, last_page = extract_pages_generic(
            pdf_path, begin_pattern, end_pattern, begin_page,
            common_header, skip_heading, output_suffix, invalid_endpage)

        boundaries_found = first_page is not None and last_page is not None
        if not boundaries_found:
            print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。")
            return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page, logger)

        if output_suffix == "qualification1":
            # Also cut the evaluation-method chapter (selection 2) under the
            # suffix "qualification3" so it can be combined with the
            # qualification-review chapter afterwards.
            truncate_pdf_main_goods(pdf_path, output_folder, 2, logger, "qualification3")

        return save_extracted_pages(pdf_path, output_folder, first_page, last_page, output_suffix, common_header)
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return ""
def get_patterns_for_procurement():
    """Build the fallback begin/end patterns for the procurement-requirements chapter.

    Both patterns are compiled with ``regex.MULTILINE`` and start with a series
    of negative lookbehinds so that references to a chapter (text preceded by
    words such as 见 / 根据 / 按照 or by a quotation mark) are not taken for a
    chapter heading.

    Returns:
        tuple: ``(begin_pattern, end_pattern)`` compiled ``regex`` patterns.
    """
    begin_pattern = regex.compile(
        r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
        r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*(?!.*说明)'  # "第X章" / "第X部分", not followed by 说明 on the line
        r'[\u4e00-\u9fff、()]*?'  # permitted heading characters
        r'(?:(?:服务|项目|商务|技术|供货)[\u4e00-\u9fff、()]*?要求[\u4e00-\u9fff、()]*?\s*$|'  # 服务/项目/商务/技术/供货 ... 要求
        r'(?:采购.*?(?:内容|要求|需求)|(招标|项目|货物)(?:内容|要求|需求))[\u4e00-\u9fff、()]*?|'
        r'需求书[\u4e00-\u9fff、()]*?)\s*$',
        regex.MULTILINE
    )
    # The numeral class includes 十, like begin_pattern and the other chapter
    # end patterns in this module, so that a following chapter numbered
    # 第十章 / 第十一章 ... is recognised as the end of the section.
    end_pattern = regex.compile(
        r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
        r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
        regex.MULTILINE
    )
    return begin_pattern, end_pattern
def get_patterns_for_evaluation_method():
    """Build the fallback begin/end patterns for the evaluation-method chapter.

    ``begin_pattern`` has two alternatives:

    1. a "第X章 / 第X部分" heading whose line mentions 评标 / 评定 / 评审, or
       磋商 / 谈判 followed later on the line by 办法 / 方法 / 内容;
    2. a line consisting only of "评标方法" / "评标办法", optionally followed by
       "前附表".

    Both alternatives reject text preceded by reference words (见, 与, 同, 对应,
    根据, 按照) or by any of the three quote characters ``"``, ``“`` and ``”``,
    the same set the other chapter patterns in this module use.

    Returns:
        tuple: ``(begin_pattern, end_pattern)`` compiled ``regex`` patterns,
        both using ``regex.MULTILINE``.
    """
    begin_pattern = regex.compile(
        r'(?:(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*'  # first alternative
        r'(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:[\u4e00-\u9fff、()]*?)'
        r'(?=.*(?:(?:磋商|谈判)(?=.*(?:办法|方法|内容))|(?:评标|评定|评审)))'
        r'[\u4e00-\u9fff、()]*\s*$|'
        r'^\s*(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)评标(方法|办法)(?:前附表)?\s*$)',  # second alternative
        regex.MULTILINE
    )
    end_pattern = regex.compile(
        r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$', regex.MULTILINE)
    return begin_pattern, end_pattern
def extract_pages_qualification(pdf_path, begin_page, common_header):
    """Locate the qualification-review pages by scanning attachments/appendices.

    The PDF is read with PyPDF2 first; if that raises, PyMuPDF (``fitz``) is
    tried. Pages are scanned from ``begin_page`` onwards.

    Args:
        pdf_path: Path of the PDF to scan.
        begin_page: Page index at which scanning starts.
        common_header: Header text passed to ``clean_page_content``.

    Returns:
        tuple: ``(start_page, end_page)``. Either element may be ``None`` when
        the corresponding boundary was not found; ``(None, None)`` when both
        PDF readers fail.
    """
    # Start marker: a line opening with 附录 / 附件 / 附表 (optionally numbered
    # 一 or 1 and followed by a colon), not followed by 的.
    begin_pattern = regex.compile(
        r'^(?:附录(?:一|1)?[:]?|附件(?:一|1)?[:]?|附表(?:一|1)?[:]?)(?!的)',
        regex.MULTILINE
    )
    # Preferred start marker: a line made of 资格/符合/资格性/符合性 (possibly
    # several joined by 、) followed by 检查 or 审查 and an optional table word.
    priority_pattern = regex.compile(
        r'^((资格|符合|资格性|符合性)(、(资格|符合|资格性|符合性))*)(检查|审查)(?:表|比对表|对照表)?\s*$',
        regex.MULTILINE
    )
    # End marker 1: another appendix / attachment / table line.
    end_pattern_attachment = regex.compile(
        r'^(?:附录(?!的).*?|附件(?!的).*?|附表(?!的).*?|附件\s*\d+(?!的)).*',
        regex.MULTILINE
    )
    # End marker 2: a chapter heading.
    end_pattern_chapter = regex.compile(
        r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$',
        regex.MULTILINE
    )
    print("第二次尝试 qualification:匹配附件")
    include_keywords = ["资格审查", "资质审查", "符合性审查","资格性审查", "符合性检查","资格性检查", "资格检查"]
    exclude_keywords = ["声明函", "承诺函"]

    def _extract_pages_loop(get_text, total_pages, begin_page, common_header):
        """Scan pages in order and return ``(start_page, end_page)``."""
        first_page = None
        last_page = None
        for page_no in range(begin_page, total_pages):
            raw_text = get_text(page_no)
            if not raw_text:
                continue
            page_text = clean_page_content(raw_text, common_header)

            if first_page is None:
                # Still looking for the start: the priority heading wins;
                # otherwise the page must mention a review keyword, contain no
                # excluded keyword and carry an attachment-style heading.
                if priority_pattern.search(page_text):
                    first_page = page_no
                    print(f"Matched priority_pattern, setting start_page to: {first_page}")
                elif (any(kw in page_text for kw in include_keywords)
                        and not any(kw in page_text for kw in exclude_keywords)
                        and begin_pattern.search(page_text)):
                    first_page = page_no
                    print(f"Matched begin_pattern, setting start_page to: {first_page}")
                # No end detection is possible on a page before/at the start.
                continue

            if end_pattern_attachment.search(page_text):
                if any(kw in page_text for kw in include_keywords):
                    # An attachment heading on a page that still talks about
                    # the review does not end the section.
                    print(f"Page {page_no} matches attachment pattern but contains include_keywords, continue.")
                elif page_no > first_page:
                    last_page = page_no
                    print(f"Found end_page (attachment): {last_page}")
                    break
            elif end_pattern_chapter.search(page_text) and page_no > first_page:
                last_page = page_no
                print(f"Found end_page (chapter title): {last_page}")
                break
        return first_page, last_page

    # First choice: PyPDF2.
    try:
        with open(pdf_path, "rb") as f:
            reader = PdfReader(f)
            page_total = len(reader.pages)
            text_of = create_get_text_function('pypdf2', reader)
            return _extract_pages_loop(text_of, page_total, begin_page, common_header)
    except Exception as e_pypdf2:
        print(f"extract_pages_qualification: Using PyPDF2 failed: {e_pypdf2}")

    # Only reached when PyPDF2 raised: retry with PyMuPDF.
    try:
        with fitz.open(pdf_path) as doc:
            page_total = doc.page_count
            text_of = create_get_text_function('fitz', doc)
            return _extract_pages_loop(text_of, page_total, begin_page, common_header)
    except Exception as e_fitz:
        print(f"Using PyMuPDF to read PDF also failed: {e_fitz}")
        return None, None
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page, logger):
    """Fallback extraction used when the first pass found no page range.

    Strategy by ``output_suffix``:

    * ``"procurement"``: retry with ``get_patterns_for_procurement``.
    * ``"evaluation_method"`` / ``"qualification2"`` / ``"qualification3"``:
      retry with ``get_patterns_for_evaluation_method``.
    * ``"qualification1"``: scan attachments with
      ``extract_pages_qualification``; if that fails too, reuse an existing
      ``<name>_evaluation_method.pdf`` in ``output_folder`` (copied to
      ``<name>_qualification2.pdf``) or extract the evaluation-method chapter
      under the suffix ``"qualification2"``.

    Args:
        pdf_path: Path of the PDF to cut.
        output_folder: Directory that receives the extracted file.
        output_suffix: Tag identifying the section to extract.
        common_header: Header text forwarded to the helper functions.
        begin_page: Page index from which the search starts.
        logger: Logger forwarded to ``truncate_pdf_main_goods``.

    Returns:
        The result of ``save_extracted_pages`` when a range was found, the path
        of the substitute file for ``"qualification1"``, or ``""`` when nothing
        could be extracted or an exception was raised.
    """
    import shutil  # function-local so the file's import block needs no change

    try:
        exclusion_pattern = regex.compile(
            r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
        patterns = None
        start_page = None
        end_page = None
        if output_suffix == "procurement":
            patterns = [get_patterns_for_procurement()]
        elif output_suffix in ("evaluation_method", "qualification2", "qualification3"):
            patterns = [get_patterns_for_evaluation_method()]
        elif output_suffix == "qualification1":
            start_page, end_page = extract_pages_qualification(pdf_path, begin_page, common_header)

        if patterns:
            for begin_pattern, end_pattern in patterns:
                start_page, end_page = extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page,
                                                             common_header, exclusion_pattern, output_suffix)
                if start_page is not None and end_page is not None:
                    break

        if start_page is not None and end_page is not None:
            return save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)

        if output_suffix != "qualification1":
            print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
            return ""

        # Third attempt for the qualification review: fall back to the
        # evaluation-method chapter.
        print("第三次尝试资格审查:尝试提取评分办法章节...")
        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        evaluation_method_file = os.path.join(output_folder, f"{base_file_name}_evaluation_method.pdf")
        if os.path.isfile(evaluation_method_file):
            print(f"找到评分办法章节文件: {evaluation_method_file},生成新文件。")
            new_file_path = os.path.join(output_folder, f"{base_file_name}_qualification2.pdf")
            # Copy without loading the whole PDF into memory.
            shutil.copyfile(evaluation_method_file, new_file_path)
            return new_file_path

        temp = truncate_pdf_main_goods(pdf_path, output_folder, 2, logger, "qualification2")
        return temp[0] if temp else ""
    except Exception as e:
        print(f"Error in extract_pages_twice: {e}")
        return ""
def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_suffix="default",invalid_endpage=-1):
    """Cut one section out of a goods-tender PDF (or every PDF in a folder).

    ``selection`` chooses the section:

    * 1 -- tender notice (``get_notice``)
    * 2 -- evaluation method (suffix ``evaluation_method``)
    * 3 -- qualification review (suffix ``qualification1``)
    * 4 -- instructions to bidders: front table + main text
      (``extract_pages_tobidders_notice``)
    * 5 -- procurement requirements (suffix ``procurement``)
    * 6 -- content handled by ``get_invalid_file``

    Args:
        input_path: A PDF/doc file (checked with ``is_pdf_or_doc``) or a
            directory containing ``.pdf`` files.
        output_folder: Directory that receives the extracted files; created if
            missing.
        selection: Section selector, see above.
        logger: Logger used for error reporting.
        output_suffix: Overrides the suffix derived from ``selection`` unless it
            is ``"default"``.
        invalid_endpage: Forwarded to the extraction helpers.

    Returns:
        list: For a single file, a list of results (two entries for selection
        4; ``[invalid_path, end_page]`` for selection 6; otherwise one entry),
        with ``""`` standing for "nothing extracted". For a directory, a list
        with one such per-file result appended per PDF.
    """
    try:
        def process_single_file(pdf_path, output_folder, selection, output_suffix):
            """Extract the selected section from one PDF and return a list of results."""
            try:
                # Create the output folder; exist_ok avoids the
                # check-then-create race of a separate os.path.exists test.
                os.makedirs(output_folder, exist_ok=True)
                # Common page header and the index of the last "begin" page.
                common_header, last_begin_index = get_start_and_common_header(pdf_path, 20)
                # When no begin index was detected (-1), use a per-selection default.
                begin_page = last_begin_index if last_begin_index != -1 else {
                    4: 0,
                    2: 5,
                    3: 5,
                    1: 0,
                    5: 3,
                    6: 0  # Added default for selection 6 if needed
                }.get(selection, 0)
                # Pick the begin/end patterns (or a dedicated helper) per selection.
                if selection == 1:  # tender notice
                    path = get_notice(pdf_path, output_folder, begin_page, common_header, invalid_endpage)
                    return [path or ""]
                elif selection == 2:  # evaluation method
                    begin_pattern = regex.compile(
                        r'^第[一二三四五六七八九十]+(?:章|部分)\s*'
                        r'(?<!"\s*)(?<!“\s*)(?<!”\s*)(?=.*(?:磋商(?=.*(?:办法|方法|内容))|'
                        r'谈判(?=.*(?:办法|方法|内容))|评标|评定|评审))',
                        regex.MULTILINE
                    )
                    end_pattern = regex.compile(
                        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',regex.MULTILINE
                    )
                    local_output_suffix = "evaluation_method"
                elif selection == 3:  # qualification review
                    begin_pattern = regex.compile(
                        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', regex.MULTILINE
                    )
                    end_pattern = regex.compile(
                        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE
                    )
                    local_output_suffix = "qualification1"
                elif selection == 4:  # instructions to bidders: front table + main text
                    path1, path2 = extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_header, invalid_endpage)
                    return [path1 or "", path2 or ""]
                elif selection == 5:  # procurement requirements ("第x章 招标需求 / 项目需求 / 货物需求 ...")
                    # MULTILINE is required so that each '^' anchors at a line
                    # start rather than only at the start of the page text,
                    # as in every other pattern of this function.
                    begin_pattern = regex.compile(
                        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术|供货).*?要求|'
                        r'^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*(?:采购.*?(?:内容|要求|需求)|(招标|项目|货物)(?:内容|要求|需求)).*|'
                        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?需求书',
                        regex.MULTILINE
                    )
                    end_pattern = regex.compile(
                        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',regex.MULTILINE
                    )
                    local_output_suffix = "procurement"
                elif selection == 6:  # content before the bid-document formats / contract terms
                    invalid_path, end_page = get_invalid_file(pdf_path, output_folder, common_header, begin_page)
                    return [invalid_path or "", end_page]
                else:
                    print("无效的选择:请选择1-6")
                    return ['']
                # A caller-supplied suffix wins; 'default' means "derive it from selection".
                if output_suffix == "default":
                    output_suffix = local_output_suffix
                result = extract_pages(
                    pdf_path,
                    output_folder,
                    begin_pattern,
                    begin_page,
                    end_pattern,
                    common_header,
                    output_suffix,
                    logger,
                    invalid_endpage
                )
                if result:
                    if output_suffix == "qualification1":
                        # Combine with the "qualification3" file produced during extraction.
                        merge_and_cleanup(result, "qualification3")
                    return [result]
                return [""]
            except Exception as e:
                logger.error(f"Error in processing file '{pdf_path}': {e}")
                return [""] * (2 if selection == 4 else 1)

        # Directory input: process every .pdf file it contains.
        if os.path.isdir(input_path):
            generated_files = []
            for file_name in os.listdir(input_path):
                if file_name.endswith('.pdf'):
                    pdf_path = os.path.join(input_path, file_name)
                    result = process_single_file(pdf_path, output_folder, selection, output_suffix)
                    # NOTE(review): process_single_file always returns a list, so the
                    # tuple branch never runs and each per-file list is appended as
                    # one nested element. Kept as-is so the return shape callers
                    # see does not change -- confirm whether a flat list was intended.
                    if isinstance(result, tuple):
                        generated_files.extend([f if f else "" for f in result])
                    else:
                        generated_files.append(result)
            return generated_files
        # Single-file input.
        elif os.path.isfile(input_path) and is_pdf_or_doc(input_path):
            return process_single_file(input_path, output_folder, selection, output_suffix)
        else:
            logger.error("提供的路径既不是文件夹也不是PDF文件。")
            return [""] * (2 if selection == 4 else 1)
    except Exception as e:
        logger.error(f"Error in truncate_pdf_main_goods: {e}")
        return [""] * (2 if selection == 4 else 1)  # empty placeholder(s)
# TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 工程标中,判断是符合性审查之后,可以将它们设为同一章
#TODO: 验证所有文件的切分逻辑。
# ztbfile.pdf少资格评审 包头少符合性评审
if __name__ == "__main__":
    # Manual run against a local sample file.
    # selection: 1 - notice, 2 - evaluation_method,
    #            3 - qualification review (suffix qualification1 or qualification2; the latter matches the evaluation method),
    #            4 - instructions to bidders (front table = part1, main text = part2),
    #            5 - procurement requirements, 6 - invalid
    selection = 4

    # Alternative sample inputs:
    # input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
    # pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
    # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
    # pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
    pdf_path = r"C:\Users\Administrator\Desktop\fsdownload\2c3e2291-6804-4ef0-b4a8-6f457edd5709\ztbfile.pdf"

    # Alternative output folder:
    # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
    output_folder = r"C:\Users\Administrator\Desktop\货物标\output4"

    logger = get_global_logger("123")
    generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection, logger)
    print(generated_files)