794 lines
37 KiB
Python
794 lines
37 KiB
Python
# flask_app/general/截取pdf通用函数.py
|
||
import concurrent.futures
|
||
import io
|
||
import os
|
||
|
||
import fitz
|
||
import regex
|
||
from PyPDF2 import PdfReader, PdfWriter
|
||
from flask_app.general.读取文件.clean_pdf import extract_common_header, clean_page_content
|
||
from flask_app.general.format_change import docx2pdf
|
||
import concurrent.futures
|
||
|
||
|
||
# 定义一个函数用于提取文本
|
||
def get_text_pypdf2(pdf, page_num):
    """Return the text of one page of a PyPDF2-style reader.

    Args:
        pdf: Object exposing an indexable ``pages`` attribute whose items
            provide ``extract_text()``.
        page_num: Index passed to ``pdf.pages``.

    Returns:
        The extracted text, or ``""`` when extraction yields a falsy value
        (``None`` or an empty string).
    """
    page = pdf.pages[page_num]
    extracted = page.extract_text()
    return extracted if extracted else ""
|
||
|
||
def get_text_fitz(pdf, page_num):
    """Return the text of one page of a PyMuPDF-style document.

    Args:
        pdf: Object exposing ``load_page(page_num)`` whose result provides
            ``get_text()``.
        page_num: Page index passed to ``load_page``.

    Returns:
        The extracted text, or ``""`` when extraction yields a falsy value.
    """
    page = pdf.load_page(page_num)
    extracted = page.get_text()
    return extracted if extracted else ""
|
||
# 定义一个内部函数来绑定 pdf_document 对象
|
||
def create_get_text_function(pdf_lib, pdf_doc):
    """Build a one-argument page-text extractor bound to ``pdf_doc``.

    Args:
        pdf_lib: ``'pypdf2'`` or ``'fitz'``; selects which helper reads pages.
        pdf_doc: The already-opened document handed to that helper.

    Returns:
        A callable ``f(page_num)`` returning the page text.

    Raises:
        ValueError: If ``pdf_lib`` is neither ``'pypdf2'`` nor ``'fitz'``.
    """
    if pdf_lib not in ('pypdf2', 'fitz'):
        raise ValueError("不支持的 PDF 库。")

    if pdf_lib == 'pypdf2':
        def extract(page_num):
            return get_text_pypdf2(pdf_doc, page_num)
    else:
        def extract(page_num):
            return get_text_fitz(pdf_doc, page_num)

    return extract
|
||
|
||
def get_start_and_common_header(input_path, end_page):
    """
    Extract the common page header of a PDF and locate its first announcement page.

    Pages are scanned in order up to ``end_page`` (inclusive). A page whose
    cleaned text contains a table-of-contents heading ("目录") is skipped; the
    first remaining page whose cleaned text matches the announcement /
    invitation heading pattern is reported.

    Args:
        input_path: Path of the PDF file.
        end_page: Last 0-based page index to examine.

    Returns:
        tuple: ``(common_header, page_index)``. ``page_index`` is the 0-based
        index of the first matching page, or ``-1`` when no page matches within
        the scanned range or neither PyPDF2 nor PyMuPDF can open the file.
        ``("", -1)`` is returned when any other exception occurs.
    """
    pdf_document = None
    try:
        common_header = extract_common_header(input_path)

        begin_pattern = regex.compile(
            r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\))]?\s*$',
            regex.MULTILINE
        )
        # Pattern recognising a table-of-contents heading.
        catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)

        # Prefer PyPDF2; fall back to PyMuPDF when PyPDF2 cannot open the file.
        try:
            pdf_document = PdfReader(input_path)
            total_pages = len(pdf_document.pages)
            get_text = create_get_text_function('pypdf2', pdf_document)
        except Exception as e_pypdf2:
            print(f"get_start_and_common_header:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
            try:
                pdf_document = fitz.open(input_path)
                total_pages = pdf_document.page_count
                get_text = create_get_text_function('fitz', pdf_document)
            except Exception as e_fitz:
                print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
                return common_header, -1

        for page_index in range(total_pages):
            if page_index > end_page:
                # Scan window exhausted without a match.
                break
            text = get_text(page_index)
            if not text:
                continue
            cleaned_text = clean_page_content(text, common_header)
            if catalog_pattern.search(cleaned_text):
                # Table-of-contents pages list the heading too; ignore them.
                continue
            if begin_pattern.search(cleaned_text):
                return common_header, page_index

        return common_header, -1
    except Exception as e:
        print(f"Error in get_start_and_common_header: {e}")
        return "", -1
    finally:
        # Close the document when the backing object supports it.
        if pdf_document and hasattr(pdf_document, "close"):
            pdf_document.close()
|
||
|
||
|
||
def check_pdf_pages(pdf_path, mode, logger):
    """Decide whether a PDF is short enough to skip the splitting logic.

    The page count is read with PyPDF2, falling back to PyMuPDF when PyPDF2
    fails.

    Args:
        pdf_path: Path of the PDF file.
        mode: ``'goods'`` selects the eight-slot result list; any other value
            selects the seven-slot list.
        logger: Object providing ``info``, ``warning`` and ``error``.

    Returns:
        tuple: ``(False, [])`` when the PDF has more than 30 pages. Otherwise,
        and also when the page count cannot be read at all, ``(True, slots)``
        where ``slots`` is a list of empty strings with ``pdf_path`` in the
        second-to-last position.
    """

    def passthrough_result():
        # Placeholder list with the untouched PDF path in the second-to-last slot.
        if mode == 'goods':
            return True, ['', '', '', '', '', '', pdf_path, '']
        return True, ['', '', '', '', '', pdf_path, '']

    pdf_document = None
    try:
        try:
            pdf_document = PdfReader(pdf_path)
            num_pages = len(pdf_document.pages)
        except Exception as e_pypdf2:
            logger.warning(f"check_pdf_pages: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
            # PyPDF2 could not read the file; try PyMuPDF instead.
            pdf_document = fitz.open(pdf_path)
            num_pages = pdf_document.page_count

        if num_pages > 30:
            # Long document: the caller should go on with splitting.
            return False, []

        logger.info("PDF(invalid_path)页数小于或等于30页,跳过切分逻辑。")
        return passthrough_result()
    except Exception as e:
        logger.error(f"无法读取 PDF 页数: {e}")
        return passthrough_result()
    finally:
        # Close the document when the backing object supports it.
        if pdf_document and hasattr(pdf_document, "close"):
            pdf_document.close()
|
||
|
||
|
||
def _write_page_range(pdf_document, using_pypdf2, page_numbers, target_path, doc_label):
    """Copy the given pages of ``pdf_document`` into a new PDF at ``target_path``.

    A page that cannot be copied is reported and skipped. Errors raised while
    writing the file propagate to the caller. ``doc_label`` only appears in the
    per-page error message.
    """
    if using_pypdf2:
        writer = PdfWriter()
        for page_num in page_numbers:
            try:
                writer.add_page(pdf_document.pages[page_num])
            except Exception as e_page:
                print(f"添加第 {page_num} 页到 {doc_label} 时出错: {e_page}")
        with open(target_path, 'wb') as f_out:
            writer.write(f_out)
        return

    new_doc = fitz.open()
    try:
        for page_num in page_numbers:
            try:
                new_doc.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
            except Exception as e_page:
                print(f"添加第 {page_num} 页到 {doc_label} 时出错: {e_page}")
        new_doc.save(target_path)
    finally:
        # Always release the temporary document, even when saving fails.
        new_doc.close()


def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header):
    """Save pages ``start_page``..``end_page`` (inclusive, 0-based) as a new PDF.

    The source is opened with PyPDF2, falling back to PyMuPDF. The result is
    written to ``<output_folder>/<source base name>_<output_suffix>.pdf``.

    When ``output_suffix`` is ``'notice'`` and ``start_page > 0``, the pages
    before the notice are additionally saved as ``<base name>_before.pdf``:
    up to and including the first page containing "目录", or all pages before
    ``start_page`` when no such page exists. A failure while writing that
    extra file is reported but does not abort the main extraction.

    Args:
        pdf_path: Path of the source PDF.
        output_folder: Directory receiving the output file(s).
        start_page: First page index to keep.
        end_page: Last page index to keep.
        output_suffix: Suffix appended to the output file name.
        common_header: Header text passed to ``clean_page_content``.

    Returns:
        The output file path, or ``""`` when a bound is ``None``, the range is
        invalid, the PDF cannot be opened, or writing fails.
    """
    pdf_document = None
    using_pypdf2 = False

    try:
        if start_page is None or end_page is None:
            print("错误: start_page 或 end_page 为 None")
            return ""

        # Prefer PyPDF2; fall back to PyMuPDF when PyPDF2 cannot open the file.
        try:
            pdf_document = PdfReader(pdf_path)
            total_pages = len(pdf_document.pages)
            get_text = create_get_text_function('pypdf2', pdf_document)
            using_pypdf2 = True
        except Exception as e_pypdf2:
            print(f"save_extracted_pages: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
            try:
                pdf_document = fitz.open(pdf_path)
                total_pages = pdf_document.page_count
                get_text = create_get_text_function('fitz', pdf_document)
                using_pypdf2 = False
            except Exception as e_fitz:
                print(f"save_extracted_pages: 使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
                return ""

        if start_page < 0 or end_page >= total_pages or start_page > end_page:
            print(f"无效的页面范围: {start_page} 到 {end_page}")
            return ""

        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")

        if output_suffix == 'notice' and start_page > 0:
            before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")

            # Find the first table-of-contents page that precedes the notice.
            toc_page = -1
            for page_num in range(min(start_page, total_pages)):
                try:
                    text = get_text(page_num)
                except Exception as e_extract:
                    print(f"提取第 {page_num} 页文本时出错: {e_extract}")
                    continue
                cleaned_text = clean_page_content(text, common_header)
                if regex.search(r'目\s*录', cleaned_text, regex.MULTILINE):
                    toc_page = page_num
                    break

            pages_to_extract = toc_page + 1 if toc_page != -1 else start_page

            try:
                _write_page_range(pdf_document, using_pypdf2, range(pages_to_extract),
                                  before_pdf_path, "before_doc")
                print(f"目录前的页面已保存到 {before_pdf_path}")
            except Exception as e_write_before:
                # Best effort: the main extraction below still runs.
                print(f"保存目录前的页面时出错: {e_write_before}")

        try:
            _write_page_range(pdf_document, using_pypdf2, range(start_page, end_page + 1),
                              output_pdf_path, "output_doc")
        except Exception as e_write:
            print(f"保存截取的页面时出错: {e_write}")
            return ""

        print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
        return output_pdf_path

    except Exception as e:
        print(f"发生未知错误: {e}")
        return ""
    finally:
        # Only the PyMuPDF document is closed here.
        if pdf_document and not using_pypdf2 and hasattr(pdf_document, "close"):
            pdf_document.close()
|
||
|
||
def is_pdf_or_doc(filename):
    """Tell whether ``filename`` ends with ``.pdf``, ``.doc`` or ``.docx``.

    The comparison is case-insensitive.
    """
    supported_suffixes = ('.pdf', '.doc', '.docx')
    lowered = filename.lower()
    return lowered.endswith(supported_suffixes)
|
||
|
||
def convert_to_pdf(file_path):
    """Return a PDF path for ``file_path``.

    Paths ending in ``.doc`` / ``.docx`` (case-insensitive) are handed to
    ``docx2pdf`` and its result is returned; any other path is returned
    unchanged.
    """
    is_word_document = file_path.lower().endswith(('.doc', '.docx'))
    if not is_word_document:
        return file_path
    return docx2pdf(file_path)
|
||
|
||
def get_invalid_file(pdf_path, output_folder, common_header, begin_page):
    """
    Cut a page range out of a PDF and save it with the ``invalid`` suffix.

    The start page is the first page, within 30 pages from ``begin_page``,
    that matches one of the begin patterns (announcement / invitation
    headings). The end page is the first page from index 30 onwards that
    matches one of the end patterns (bid-response format chapter, contract
    chapter, ...).

    Page-count rules:
      * 50 pages or fewer: nothing is cut and ``(pdf_path, 0)`` is returned.
      * Fewer than 100 pages: the start page is fixed to 0.
      * More than 200 pages: the end page is searched front to back, skipping
        pages that match an exclusion pattern; otherwise it is searched back
        to front.
      * No end match: the end page defaults to ``max(100, total // 2)`` above
        200 pages, ``max(100, total * 2 // 3)`` between 101 and 200 pages, and
        to the last page otherwise.

    Args:
        pdf_path (str): Path of the input PDF.
        output_folder (str): Directory receiving the extracted file.
        common_header (str): Common header text used to clean each page.
        begin_page (int): Page index where the start-page search begins.
            Negative values are treated as 0.

    Returns:
        tuple: ``(output_path, end_page_index)``. ``output_path`` is the path
        returned by ``save_extracted_pages`` (``""`` if saving failed).
        ``("", 0)`` is returned when the computed range is invalid or any
        exception occurs.
    """
    # Patterns marking the beginning of the range.
    begin_patterns = [
        regex.compile(
            r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\))]?\s*$',
            regex.MULTILINE
        ),
        regex.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请',
            regex.MULTILINE
        )
    ]
    # Patterns marking the end of the range.
    end_patterns = [
        regex.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:(?:文件|书)(?:的)?(?:组成|构成|编制)|格式).*|'
            r'第[一二三四五六七八九十]+(?:章|部分).*?合同|[::]清标报告',
            regex.MULTILINE
        ),
        regex.compile(
            r"\s*(投标文件|响应文件|响应性文件|应答文件)(?:的)?格式(?:及相关附件)?\s*$",
            regex.MULTILINE
        ),
    ]
    # Pages matching this are never accepted as the start page.
    exclusion_pattern = regex.compile(
        r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
    # Wider exclusion used by the forward end-page search; compiled once here
    # instead of once per page.
    end_exclusion_pattern = regex.compile(
        r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制|评审要素|评审标准)\s*$',
        regex.MULTILINE
    )

    pdf_document = None
    try:
        # Prefer PyPDF2; fall back to PyMuPDF when PyPDF2 cannot open the file.
        try:
            pdf_document = PdfReader(pdf_path)
            get_text = create_get_text_function('pypdf2', pdf_document)
            total_pages = len(pdf_document.pages)
        except Exception as e_pypdf2:
            print(f"get_invalid_file: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
            pdf_document = fitz.open(pdf_path)
            get_text = create_get_text_function('fitz', pdf_document)
            total_pages = pdf_document.page_count

        if total_pages <= 50:
            print(f"PDF页数({total_pages}) <= 50,直接返回文件路径: {pdf_path}")
            return pdf_path, 0

        # Extract and clean every page once; both searches reuse the result.
        page_texts = []
        for i in range(total_pages):
            text = get_text(i)
            page_texts.append(clean_page_content(text, common_header) if text else "")

        def find_start_page(begin_page):
            # A negative index would wrap around to the end of page_texts.
            first_page = max(begin_page, 0)
            for pattern in begin_patterns:
                for i in range(first_page, min(first_page + 30, total_pages)):
                    text = page_texts[i]
                    if exclusion_pattern.search(text):
                        continue
                    if pattern.search(text):
                        return i
            return 0  # default: start from the first page

        def find_end_page():
            if total_pages > 200:
                # Long document: search forwards from page index 30.
                for pattern in end_patterns:
                    for i in range(30, total_pages):
                        text = page_texts[i]
                        if end_exclusion_pattern.search(text):
                            continue
                        if pattern.search(text):
                            return i
            else:
                # Shorter document: search backwards, never before index 30.
                for pattern in end_patterns:
                    for i in range(total_pages - 1, 29, -1):
                        if pattern.search(page_texts[i]):
                            return i

            # No end pattern matched: choose a default end page.
            if total_pages > 200:
                default_end = max(100, int(total_pages * 1 / 2))
                print(f"未找到结束模式,总页数大于200,平滑调整结束页为 {default_end}。{pdf_path}")
                return default_end
            elif 100 < total_pages <= 200:
                print("未找到结束模式,总页数在100到200之间,设置结束页为总页数的三分之二。 " + pdf_path)
                return max(100, int(total_pages * 2 / 3))
            else:
                print("未找到结束模式,设置结束页为最后一页。 " + pdf_path)
                return total_pages - 1

        if total_pages < 100:
            # Short document: start at page 0; only the end page is searched,
            # so no thread pool is needed.
            start_page_found = 0
            end_page_found = find_end_page()
        else:
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future_start = executor.submit(find_start_page, begin_page)
                future_end = executor.submit(find_end_page)
                start_page_found = future_start.result()
                end_page_found = future_end.result()

        if start_page_found > end_page_found:
            print(f"无效的页码范围: 起始页 ({start_page_found + 1}) > 结束页 ({end_page_found + 1})")
            return "", 0

        output_path = save_extracted_pages(
            pdf_path=pdf_path,
            output_folder=output_folder,
            start_page=start_page_found,
            end_page=end_page_found,
            output_suffix="invalid",
            common_header=common_header
        )
        return output_path, end_page_found

    except Exception as e:
        print(f"处理文件 {pdf_path} 时发生错误: {e}")
        return "", 0

    finally:
        # Close the document when the backing object supports it.
        if pdf_document and hasattr(pdf_document, "close"):
            pdf_document.close()
|
||
|
||
|
||
def extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
                          output_suffix="normal", invalid_endpage=-1):
    """Locate the start and end page of a section by regular expressions.

    Pages from ``begin_page`` up to (not including) the scan limit are read
    with PyPDF2, or PyMuPDF if PyPDF2 cannot open the file, cleaned with
    ``clean_page_content`` and matched in order. Pages without text are
    skipped.

    Args:
        pdf_path: Path of the PDF file.
        begin_pattern: Pattern (compiled or string) marking the start page.
        end_pattern: Pattern (compiled or string) marking the end page.
        begin_page: Index where scanning starts. A negative value scans from
            page 0. For ``output_suffix == "notice"`` the start page may equal
            ``begin_page``; for other suffixes it must be greater.
        common_header: Header text passed to ``clean_page_content``.
        exclusion_pattern: Optional pattern. The first matching page is
            skipped once: before the start page is found in general, or after
            it is found when ``output_suffix == "tobidders_notice2"``.
        output_suffix: Section kind; selects the matching rules described
            above and the end-page rule.
        invalid_endpage: Exclusive scan limit, or ``-1`` to scan all pages.

    Returns:
        tuple: ``(start_page, end_page)``; either element is ``None`` when not
        found. ``(None, None)`` when the PDF cannot be opened by either
        library.
    """
    start_page = None
    end_page = None
    exclusion_armed = True  # the exclusion pattern may skip only one page

    # Prefer PyPDF2; fall back to PyMuPDF when PyPDF2 cannot open the file.
    try:
        pdf_document = PdfReader(pdf_path)
        total_pages = len(pdf_document.pages)
        get_text = create_get_text_function('pypdf2', pdf_document)
    except Exception as e_pypdf2:
        print(f"extract_pages_generic:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
        try:
            pdf_document = fitz.open(pdf_path)
            total_pages = pdf_document.page_count
            get_text = create_get_text_function('fitz', pdf_document)
        except Exception as e_fitz:
            print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
            return None, None

    try:
        # Compiled once; only consulted for the "tobidders_notice2" pass.
        catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
        end_limit = total_pages if invalid_endpage == -1 else min(invalid_endpage, total_pages)

        # max(..., 0): a negative begin_page must not wrap to the last page.
        for i in range(max(begin_page, 0), end_limit):
            text = get_text(i)
            if not text:
                continue
            cleaned_text = clean_page_content(text, common_header)

            if output_suffix == "tobidders_notice2":
                # Used by extract_pages_twice_tobidders_notice.
                if exclusion_pattern and exclusion_armed and (start_page is not None) \
                        and regex.search(exclusion_pattern, cleaned_text):
                    exclusion_armed = False
                    continue
                if begin_page == 0 and catalog_pattern.search(cleaned_text):
                    continue  # table-of-contents page
            else:
                # Keeps begin_pattern from matching a page that the exclusion
                # pattern identifies, while the start page is still unknown.
                if exclusion_pattern and exclusion_armed and (start_page is None) \
                        and regex.search(exclusion_pattern, cleaned_text):
                    exclusion_armed = False
                    continue

            if start_page is None and regex.search(begin_pattern, cleaned_text):
                if (output_suffix == "notice" and i >= begin_page) or \
                        (output_suffix != "notice" and i > begin_page):
                    start_page = i
                    continue

            if start_page is not None:
                if output_suffix in ["tobidders_notice", "notice", "tobidders_notice2"]:
                    # The end page must lie strictly after the start page.
                    if regex.search(end_pattern, cleaned_text) and i > start_page:
                        end_page = i
                        break
                else:
                    # The end page must match end_pattern but not begin_pattern.
                    if regex.search(end_pattern, cleaned_text) and not regex.search(begin_pattern, cleaned_text):
                        end_page = i
                        break
    finally:
        # A PyMuPDF document must be closed explicitly, also on errors.
        if pdf_document and isinstance(pdf_document, fitz.Document):
            pdf_document.close()

    return start_page, end_page
|
||
|
||
|
||
def generate_mid_pattern(chapter_type=None):
    """
    Build the compiled pattern that recognises the "middle" page.

    The base pattern matches numbered headings such as 说明 / 总则 / 名词解释 /
    定义 and lines ending in "...须知正文". When ``chapter_type`` is '章' the
    pattern additionally matches a "第…部分" heading; when it is '部分' it
    additionally matches a "第…章" heading.

    Args:
        chapter_type (str, optional): '章' or '部分'; any other value adds
            nothing to the base pattern.

    Returns:
        regex.Pattern: Pattern compiled with ``regex.MULTILINE``.
    """
    base_mid_pattern = (
        r'^\s*(?:[((]\s*[一二12]?\s*[))]\s*[、..]*|'
        r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释|定\s*义)|'
        r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$'
    )

    # A heading of the *other* kind is added as an alternative.
    if chapter_type == '章':
        extra_heading = r'^第[一二三四五六七八九十百千]+?(?:部分)'
    elif chapter_type == '部分':
        extra_heading = r'^第[一二三四五六七八九十百千]+?(?:章)'
    else:
        extra_heading = ''

    if not extra_heading:
        pattern_source = base_mid_pattern
    else:
        pattern_source = rf'(?:{base_mid_pattern})|(?:{extra_heading})'

    return regex.compile(pattern_source, regex.MULTILINE)
|
||
|
||
|
||
def generate_end_pattern(extraction_stage, chapter_type=None):
    """
    Build the compiled pattern that recognises the end page.

    Args:
        extraction_stage (str): 'third' selects the template anchored at the
            end of the line and guarded by negative lookbehinds; any other
            value selects the line-start template.
        chapter_type (str, optional): Heading kind substituted into the
            template ('章' or '部分'); when falsy, both kinds are accepted.

    Returns:
        regex.Pattern: Pattern compiled with ``regex.MULTILINE``.
    """
    # Appendix / attachment headings shared by both templates.
    common_patterns = (
        r'^附录(?:一)?[::]|'
        r'^附件(?:一)?[::]|'
        r'^附表(?:一)?[::]'
    )

    if extraction_stage == 'third':
        template = (
            r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:{chapter_type})\s*[\u4e00-\u9fff、()()/]+\s*$'
            r'|' + common_patterns
        )
    else:
        template = (
            r'^第[一二三四五六七八九十百千]+(?:{chapter_type})\s*[\u4e00-\u9fff]+'
            r'|' + common_patterns
        )

    heading_kinds = chapter_type if chapter_type else '章|部分'
    return regex.compile(template.format(chapter_type=heading_kinds), regex.MULTILINE)
|
||
|
||
|
||
def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_header, invalid_endpage=-1):
    """
    Extract the "instructions to bidders" section as two PDF files.

    Three attempts are made in order:
      1. a page scan with the primary begin pattern (stage 'first');
      2. ``extract_pages_twice_tobidders_notice``;
      3. a page scan with a looser begin pattern (stage 'third').
    The first attempt that yields a complete set of page indices is saved via
    ``save_extracted_pages`` with the suffixes ``tobidders_notice_part1`` and
    ``tobidders_notice_part2``.

    Args:
        pdf_path: Path of the PDF file.
        output_folder: Directory receiving the output files.
        begin_page: The start page must lie strictly after this index; when it
            is 0, table-of-contents pages are skipped during the scan.
        common_header: Header text passed to ``clean_page_content``.
        invalid_endpage: Exclusive scan limit, or ``-1`` to scan all pages.

    Returns:
        tuple: ``(path1, path2)``; ``("", "")`` when every attempt fails, the
        PDF cannot be opened, or an exception occurs.
    """
    exclusion_pattern = regex.compile(
        r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)

    def run_extraction(begin_pattern, extraction_stage='first'):
        """
        Scan the pages once and return ``(start_page, mid_page, end_page)``.

        ``begin_pattern`` identifies the start page. The mid and end patterns
        are generated from the heading kind ('章' / '部分') found in the start
        match and from ``extraction_stage``. Elements not found are ``None``.
        """
        start_page = None
        mid_page = None
        end_page = None
        combined_mid_pattern = None
        catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
        end_limit = total_pages if invalid_endpage == -1 else min(invalid_endpage, total_pages)

        for i in range(end_limit):
            try:
                text = get_text(i)
                cleaned_text = clean_page_content(text, common_header)
            except Exception as e:
                print(f"提取第 {i} 页文本时出错: {e}")
                continue

            # Once the mid page is known, pages matching the exclusion pattern are ignored.
            if exclusion_pattern and mid_page is not None and regex.search(exclusion_pattern, cleaned_text):
                continue

            # Table-of-contents pages are ignored when scanning from page 0.
            if begin_page == 0 and catalog_pattern.search(cleaned_text):
                continue

            # ---- start page ----
            if start_page is None:
                heading = regex.search(begin_pattern, cleaned_text)
                if heading and i > begin_page:
                    start_page = i
                    matched_text = heading.group(0)
                    # Heading kind found in the match drives the generated patterns.
                    if '章' in matched_text:
                        chapter_type = '章'
                    elif '部分' in matched_text:
                        chapter_type = '部分'
                    else:
                        chapter_type = None
                    end_pattern_dynamic = generate_end_pattern(extraction_stage, chapter_type)
                    combined_mid_pattern = generate_mid_pattern(chapter_type)
                # Mid/end detection only begins on the page after the start page.
                continue

            # ---- mid page ----
            if mid_page is None and combined_mid_pattern and regex.search(combined_mid_pattern, cleaned_text):
                mid_page = i

            # ---- end page: must lie strictly after the mid page ----
            if mid_page is not None and i > mid_page and regex.search(end_pattern_dynamic, cleaned_text):
                end_page = i
                break

        return start_page, mid_page, end_page

    def perform_extraction_and_save(start_page, mid_page, end_page):
        """Save start..mid as part 1 and mid..end as part 2 (reusing part 1 when mid == end)."""
        path1 = save_extracted_pages(pdf_path, output_folder, start_page, mid_page, "tobidders_notice_part1",
                                     common_header)
        if mid_page == end_page:
            return path1, path1
        path2 = save_extracted_pages(pdf_path, output_folder, mid_page, end_page, "tobidders_notice_part2",
                                     common_header)
        return path1, path2

    pdf_document = None
    try:
        # Prefer PyPDF2; fall back to PyMuPDF when PyPDF2 cannot open the file.
        try:
            pdf_document = PdfReader(pdf_path)
            get_text = create_get_text_function('pypdf2', pdf_document)
            total_pages = len(pdf_document.pages)
        except Exception as e_pypdf2:
            print(f"extract_pages_tobidders_notice:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
            try:
                pdf_document = fitz.open(pdf_path)
                get_text = create_get_text_function('fitz', pdf_document)
                total_pages = pdf_document.page_count
            except Exception as e_fitz:
                print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
                return "", ""

        # ---- attempt 1: primary begin pattern ----
        begin_pattern = regex.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+|'
            r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表\s*$|'
            r'(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+\s*$',
            regex.MULTILINE
        )
        first_result = run_extraction(begin_pattern, extraction_stage='first')
        if all(page is not None for page in first_result):
            return perform_extraction_and_save(*first_result)

        # ---- attempt 2: two-pass extraction ----
        print(f"第一次提取tobidders_notice失败,尝试第二次提取: {pdf_path}")
        start_page1, end_page1, start_page2, end_page2 = extract_pages_twice_tobidders_notice(pdf_path, common_header, begin_page)

        if all(page is not None for page in (start_page1, end_page1, start_page2, end_page2)):
            if end_page1 == start_page2:
                # The two parts are adjacent: the shared page is the mid page.
                return perform_extraction_and_save(start_page1, end_page1, end_page2)
            path1 = save_extracted_pages(pdf_path, output_folder, start_page1, end_page1, "tobidders_notice_part1", common_header)
            path2 = save_extracted_pages(pdf_path, output_folder, start_page2, end_page2, "tobidders_notice_part2", common_header)
            return path1, path2

        # ---- attempt 3: looser begin pattern ----
        print("第二次提取tobidders_notice失败,尝试第三次提取!")
        new_begin_pattern = regex.compile(
            r'.*(?:投标人|磋商|谈判|供应商|应答人|比选申请人).*须知\s*$|'
            r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表\s*$',
            regex.MULTILINE
        )
        third_result = run_extraction(new_begin_pattern, extraction_stage='third')
        if all(page is not None for page in third_result):
            return perform_extraction_and_save(*third_result)

        # Every attempt failed.
        print(f"三次提取tobidders_notice均失败!! {pdf_path}")
        return "", ""
    except Exception as e:
        print(f"处理文件 {pdf_path} 时发生错误: {e}")
        return "", ""
    finally:
        # Close the document when the backing object supports it.
        if pdf_document and hasattr(pdf_document, "close"):
            pdf_document.close()
|
||
|
||
|
||
|
||
def extract_pages_twice_tobidders_notice(pdf_path, common_header, begin_page):
    """Locate the bidder-instructions section as two consecutive chapter ranges.

    Intended for documents where the instructions data sheet and the
    instructions body each occupy a chapter of their own. The first range is
    found with ``extract_pages_generic`` (suffix ``tobidders_notice``). The
    page that ends it is then inspected: if its chapter title contains one of
    the exclusion words, the second range must again start at a
    bidder-instructions heading (the two parts are not adjacent); otherwise
    any chapter heading may start it.

    The page text is read with PyPDF2, falling back to PyMuPDF when PyPDF2
    cannot open the file.

    Args:
        pdf_path: Path of the PDF file.
        common_header: Header text passed to ``clean_page_content``.
        begin_page: Page index where the first search starts.

    Returns:
        tuple: ``(start_page1, end_page1, start_page2, end_page2)``.
        All four are ``None`` when the PDF cannot be opened or the first range
        is not found. When only the second range is missing, the last two
        elements both equal ``end_page1``.
    """
    output_suffix = "tobidders_notice"
    begin_pattern = regex.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知)+',
        regex.MULTILINE
    )
    # Group 1 captures the Chinese chapter title.
    second_begin_pattern = regex.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)', regex.MULTILINE
    )
    end_pattern = second_begin_pattern
    # Chapter titles containing any of these cannot start the second part.
    exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"]

    exclusion_pattern = regex.compile(
        r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)

    pdf_document = None
    try:
        # Prefer PyPDF2; fall back to PyMuPDF like the other helpers do.
        try:
            pdf_document = PdfReader(pdf_path)
            get_text = create_get_text_function('pypdf2', pdf_document)
        except Exception as e_pypdf2:
            print(f"extract_pages_twice_tobidders_notice: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
            try:
                pdf_document = fitz.open(pdf_path)
                get_text = create_get_text_function('fitz', pdf_document)
            except Exception as e_fitz:
                print(f"读取 PDF 文件失败: {e_fitz}")
                return None, None, None, None

        # ---- first part ----
        start_page1, end_page1 = extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, common_header,
                                                       exclusion_pattern, output_suffix)
        if start_page1 is None or end_page1 is None:
            return None, None, None, None

        # ---- second part: inspect the page that ended the first part ----
        text = get_text(end_page1)
        cleaned_text = clean_page_content(text, common_header)
        match = end_pattern.search(cleaned_text)
        if match:
            chapter_title = match.group(1) if match.groups() else ""
            if any(word in chapter_title for word in exclusion_words):
                # The next chapter is unrelated, so the second part has to be
                # found by the bidder-instructions heading itself.
                second_begin_pattern = begin_pattern

        output_suffix = 'tobidders_notice2'
        # extract_pages_generic requires start_page > begin_page for this
        # suffix, so start one page earlier to let end_page1 itself qualify.
        begin_page = end_page1 - 1
        start_page2, end_page2 = extract_pages_generic(pdf_path, second_begin_pattern, end_pattern, begin_page,
                                                       common_header, exclusion_pattern, output_suffix)

        if start_page2 is None or end_page2 is None:
            return start_page1, end_page1, end_page1, end_page1

        return start_page1, end_page1, start_page2, end_page2
    finally:
        # Close the document when the backing object supports it.
        if pdf_document and hasattr(pdf_document, "close"):
            pdf_document.close()
|
||
|
||
def get_notice(pdf_path, output_folder, begin_page, common_header,invalid_endpage):
    """Extract the announcement / invitation section and save it as ``*_notice.pdf``.

    The page range is located with ``extract_pages_generic`` (suffix
    ``notice``) and written with ``save_extracted_pages``.

    Args:
        pdf_path: Path of the PDF file.
        output_folder: Directory receiving the output file.
        begin_page: Page index where the search starts.
        common_header: Header text passed to ``clean_page_content``.
        invalid_endpage: Exclusive scan limit, or ``-1`` to scan all pages.

    Returns:
        The saved file path, or ``""`` when the start or end page is not found
        or saving fails.
    """
    begin_pattern = regex.compile(
        r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\))]?\s*$',
        regex.MULTILINE
    )
    end_pattern = regex.compile(
        r'^第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|'  # alternative 1: chapter heading at line start
        r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'  # negative lookbehinds
        r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]+\s*$|'  # alternative 2: chapter heading at line end
        r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
        r'(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+\s*$',  # alternative 3: bidder-instructions heading
        regex.MULTILINE
    )
    exclusion_pattern = regex.compile(
        r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
    output_suffix = 'notice'
    start_page, end_page = extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, common_header,
                                                 exclusion_pattern, output_suffix, invalid_endpage)
    if start_page is None or end_page is None:
        print(f"{output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
        # Nothing to save; return the same "" that the save step would
        # produce for a missing bound, without reporting a second error.
        return ""
    return save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)
|
||
|
||
|
||
if __name__ == "__main__":
    # Manual entry point: only sets up hard-coded local sample paths.
    # Source PDF used for ad-hoc runs.
    file_path = r'D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\ztbfile.pdf'
    # Directory meant to receive the extracted files.
    output_folder = r'D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\tmp'
    # Example invocation, currently disabled:
    # res=get_invalid_file(file_path,output_folder,"",0)
|