zbparse/flask_app/general/截取pdf通用函数.py

769 lines
36 KiB
Python
Raw Normal View History

# flask_app/general/截取pdf通用函数.py
2024-12-17 18:44:58 +08:00
import concurrent.futures
import os
import fitz
2024-12-17 18:44:58 +08:00
import regex
2024-12-18 10:32:24 +08:00
from PyPDF2 import PdfReader, PdfWriter
2025-02-16 18:09:45 +08:00
from flask_app.general.读取文件.clean_pdf import extract_common_header, clean_page_content, create_get_text_function, \
get_text_pypdf2
2024-12-18 10:32:24 +08:00
from flask_app.general.format_change import docx2pdf
2024-12-18 16:01:32 +08:00
import concurrent.futures
2024-12-18 10:32:24 +08:00
def get_start_and_common_header(input_path, end_page):
    """
    Locate the common page header and the starting page of a tender PDF.

    Args:
        input_path: path to the PDF file.
        end_page: last page index (inclusive) worth scanning for the start page.

    Returns:
        tuple (common_header, last_begin_index):
        - common_header: text repeated at the top of pages ("" when header
          extraction fails).
        - last_begin_index: 0-based index of the first page whose content
          matches an announcement/invitation heading, or -1 when no page
          matches or the PDF cannot be read.
    """
    try:
        # Header shared by all pages; used to clean per-page text below.
        common_header = extract_common_header(input_path)
    except Exception as e:
        print(f"提取公共页眉时出错: {e}")
        return "", -1

    # Announcement/invitation headings at end of line; the look-behinds reject
    # cross references such as "见...公告" / "根据...公告".
    begin_pattern = regex.compile(
        r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)]?\s*$',
        regex.MULTILINE
    )
    catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)

    def process_pages(get_text, total_pages):
        # Only pages 0..end_page are relevant, so clamp the range up front
        # instead of testing `i > end_page` on every iteration.
        for i in range(min(total_pages, end_page + 1)):
            text = get_text(i)
            if text:
                cleaned_text = clean_page_content(text, common_header)
                # Skip table-of-contents pages.
                if catalog_pattern.search(cleaned_text):
                    continue
                if begin_pattern.search(cleaned_text):
                    return i  # first matching page (0-based)
        return -1  # no match within the allowed range

    # First attempt: PyPDF2.
    try:
        with open(input_path, "rb") as f:
            pdf_document = PdfReader(f)
            total_pages = len(pdf_document.pages)
            get_text = create_get_text_function('pypdf2', pdf_document)
            last_begin_index = process_pages(get_text, total_pages)
            return common_header, last_begin_index
    except Exception as e_pypdf2:
        print(f"get_start_and_common_header: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
        # Fallback: PyMuPDF (fitz).
        try:
            with fitz.open(input_path) as pdf_document:
                total_pages = pdf_document.page_count
                get_text = create_get_text_function('fitz', pdf_document)
                last_begin_index = process_pages(get_text, total_pages)
                return common_header, last_begin_index
        except Exception as e_fitz:
            print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
            return common_header, -1
2024-12-17 18:44:58 +08:00
def check_pdf_pages(pdf_path, mode, logger):
    """
    Decide whether a PDF is short enough (<= 30 pages) to skip the splitting
    pipeline.

    Returns:
        (True, placeholder_list) for short or unreadable PDFs — the list's
        length depends on *mode*; (False, []) when the PDF has more than 30
        pages and should be processed normally.
    """
    def generate_invalid_return():
        # 'goods' callers expect one extra placeholder slot before pdf_path.
        placeholders = (['', '', '', '', '', '', pdf_path, '']
                       if mode == 'goods'
                       else ['', '', '', '', '', pdf_path, ''])
        return True, placeholders

    def count_pages():
        # Prefer PyPDF2; fall back to PyMuPDF when it cannot parse the file.
        try:
            with open(pdf_path, "rb") as fh:
                return len(PdfReader(fh).pages)
        except Exception as e_pypdf2:
            logger.warning(f"check_pdf_pages: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
            with fitz.open(pdf_path) as doc:
                return doc.page_count

    try:
        if count_pages() <= 30:
            logger.info("PDFinvalid_path页数小于或等于30页跳过切分逻辑。")
            return generate_invalid_return()
        # More than 30 pages: caller should continue with the split logic.
        return False, []
    except Exception as e:
        logger.error(f"无法读取 PDF 页数: {e}")
        return generate_invalid_return()
2025-02-16 18:09:45 +08:00
def extract_and_save_pages(pdf_document, from_page, to_page, output_path, is_using_pypdf2):
    """
    Extract pages [from_page, to_page] (inclusive, 0-based) from pdf_document
    and save them to output_path.

    Args:
        pdf_document: an open PdfReader (PyPDF2) or fitz.Document (PyMuPDF).
        from_page / to_page: inclusive page range to copy.
        output_path: destination PDF path.
        is_using_pypdf2: True when pdf_document is a PyPDF2 reader, False for
            a PyMuPDF document.

    Returns:
        output_path on success, "" when the output cannot be written.
    """
    if is_using_pypdf2:
        # PyPDF2 path: copy pages into a writer, then write once.
        writer = PdfWriter()
        for page_num in range(from_page, to_page + 1):
            try:
                writer.add_page(pdf_document.pages[page_num])
            except Exception as e_page:
                # Skip unreadable pages but keep the rest of the range.
                print(f"添加第 {page_num} 页到 writer 时出错: {e_page}")
                continue

        try:
            with open(output_path, 'wb') as f_out:
                writer.write(f_out)
            # Fixed message: separator between page range and path was lost.
            print(f"已截取并保存页面 {from_page} - {to_page} 到 {output_path}")
        except Exception as e_write:
            print(f"保存时出错: {e_write}")
            return ""
        return output_path
    else:
        # PyMuPDF path: build a new document page by page.
        output_doc = fitz.open()
        try:
            for page_num in range(from_page, to_page + 1):
                try:
                    output_doc.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
                except Exception as e_page:
                    print(f"添加第 {page_num} 页到 output_doc 时出错: {e_page}")
                    continue
            output_doc.save(output_path)
            print(f"已截取并保存页面 {from_page} - {to_page} 到 {output_path}")
            return output_path
        except Exception as e_write:
            print(f"保存时出错: {e_write}")
            return ""
        finally:
            output_doc.close()
def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header):
    """
    Save pages [start_page, end_page] of pdf_path into output_folder as
    "<basename>_<output_suffix>.pdf".

    For output_suffix == 'notice' with start_page > 0, the pages before the
    section (up to and including a table-of-contents page, when one is found)
    are additionally saved as "<basename>_before.pdf".

    Returns:
        the saved file path, or "" on any failure / invalid range.
    """
    # Bail out early on an undetermined range.
    if start_page is None or end_page is None:
        print("错误: start_page 或 end_page 为 None")
        return ""

    def process_document(pdf_document, total_pages, using_pypdf2, get_text):
        # Validate the requested range against the actual document.
        if start_page < 0 or end_page >= total_pages or start_page > end_page:
            # Fixed message: separator between the two page numbers was lost.
            print(f"无效的页面范围: {start_page} 到 {end_page}")
            return ""
        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")

        # For the notice section, also save everything before it.
        if output_suffix == 'notice' and start_page > 0:
            before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
            toc_page = -1
            # Scan pages 0 .. start_page-1 for a table-of-contents marker.
            for page_num in range(min(start_page, total_pages)):
                try:
                    text = get_text(page_num)
                    cleaned_text = clean_page_content(text, common_header)
                    # Fixed pattern: '目' was lost, leaving bare '录' which also
                    # matched 附录/记录 etc.; restore the intended "目录" search.
                    if regex.search(r'目\s*录', cleaned_text, regex.MULTILINE):
                        toc_page = page_num
                        break
                except Exception as e_extract:
                    print(f"提取第 {page_num} 页文本时出错: {e_extract}")
                    continue

            # Include the TOC page when found, otherwise everything before start_page.
            pages_to_extract = toc_page + 1 if toc_page != -1 else start_page
            if pages_to_extract > 0:
                extract_and_save_pages(pdf_document, 0, pages_to_extract - 1, before_pdf_path, using_pypdf2)
        # Finally save the requested section itself.
        return extract_and_save_pages(pdf_document, start_page, end_page, output_pdf_path, using_pypdf2)

    # Stage 1: PyPDF2 (file handle closed by the with-block).
    try:
        with open(pdf_path, "rb") as pdf_file:
            pdf_document = PdfReader(pdf_file)
            total_pages = len(pdf_document.pages)
            get_text = create_get_text_function('pypdf2', pdf_document)
            using_pypdf2 = True
            # Must run inside the with-block while the stream is alive.
            return process_document(pdf_document, total_pages, using_pypdf2, get_text)
    except Exception as e_pypdf2:
        print(f"使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
        # Stage 2: PyMuPDF fallback.
        try:
            with fitz.open(pdf_path) as pdf_document:
                total_pages = pdf_document.page_count
                get_text = create_get_text_function('fitz', pdf_document)
                using_pypdf2 = False
                return process_document(pdf_document, total_pages, using_pypdf2, get_text)
        except Exception as e_fitz:
            print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
            return ""
2024-12-18 10:32:24 +08:00
def is_pdf_or_doc(filename):
    """Return True when *filename* has a PDF or Word extension (case-insensitive)."""
    document_extensions = ('.pdf', '.doc', '.docx')
    return filename.lower().endswith(document_extensions)
2025-02-14 15:46:07 +08:00
def get_invalid_file(pdf_path, output_folder, common_header, begin_page):
    """
    Extract the "invalid" (boilerplate) portion of a tender PDF and save it.

    Behavior notes (from the original implementation):
    - PDFs with <= 50 pages are returned unchanged.
    - For < 100 pages, the start page defaults to 0 (no begin-pattern search).
    - For > 200 pages, the end pattern is searched forward from page 31;
      otherwise backward from the last page (always keeping the first 30).

    Args:
        pdf_path (str): input PDF path.
        output_folder (str): folder for the extracted file.
        common_header (str): shared page header used to clean page text.
        begin_page: page index from which to start the begin-pattern search.

    Returns:
        tuple (output_path, end_page_found); ("", 0) on failure.
    """
    # Section-start headings (announcements / invitations).
    begin_patterns = [
        regex.compile(
            r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)]?\s*$',
            regex.MULTILINE
        ),
        regex.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请',
            regex.MULTILINE
        )
    ]
    # Section-end headings (bid-document format / contract chapters).
    end_patterns = [
        regex.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标|应答).*?(?:(?:文件|书)(?:的)?(?:组成|构成|编制)|格式).*|'
            r'第[一二三四五六七八九十]+(?:章|部分).*?合同|[:]清标报告',
            regex.MULTILINE
        ),
        regex.compile(
            r"\s*(投标文件|响应文件|响应性文件|应答文件)(?:的)?格式(?:及相关附件)?\s*$",
            regex.MULTILINE
        ),
    ]
    # Lines that must not count as a start match.
    exclusion_pattern = regex.compile(
        r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
    # Hoisted out of the page loop: previously recompiled per page in
    # find_end_page. Extends exclusion_pattern with evaluation headings.
    end_exclusion_pattern = regex.compile(
        r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制|评审要素|评审标准)\s*$',
        regex.MULTILINE
    )

    pdf_document = None
    try:
        # Load all page texts up front (PyPDF2, then PyMuPDF fallback).
        try:
            with open(pdf_path, "rb") as f:
                pdf_document = PdfReader(f)
                get_text = create_get_text_function('pypdf2', pdf_document)
                total_pages = len(pdf_document.pages)
                if total_pages <= 50:
                    print(f"PDF页数({total_pages}) <= 50直接返回文件路径: {pdf_path}")
                    return pdf_path, 0
                page_texts = []
                for i in range(total_pages):
                    text = get_text(i)
                    cleaned_text = clean_page_content(text, common_header) if text else ""
                    page_texts.append(cleaned_text)
        except Exception as e:
            with fitz.open(pdf_path) as pdf_document:
                total_pages = pdf_document.page_count
                get_text = create_get_text_function('fitz', pdf_document)
                if total_pages <= 50:
                    print(f"PDF页数({total_pages}) <= 50直接返回文件路径: {pdf_path}")
                    return pdf_path, 0
                page_texts = []
                for i in range(total_pages):
                    text = get_text(i)
                    cleaned_text = clean_page_content(text, common_header) if text else ""
                    page_texts.append(cleaned_text)

        def find_start_page(begin_page):
            # Search at most 30 pages from begin_page for a start heading.
            for pattern in begin_patterns:
                for i in range(begin_page, min(begin_page + 30, total_pages)):
                    text = page_texts[i]
                    if regex.search(exclusion_pattern, text):
                        continue
                    if regex.search(pattern, text):
                        return i
            return 0  # default: start at the first page

        def find_end_page():
            if total_pages > 200:
                # Many pages: search forward from page 31.
                for pattern in end_patterns:
                    for i in range(30, total_pages):
                        text = page_texts[i]
                        if regex.search(end_exclusion_pattern, text):
                            continue
                        if regex.search(pattern, text):
                            return i
            else:
                # Fewer pages: search backward, keeping at least the first 30.
                for pattern in end_patterns:
                    for i in range(total_pages - 1, 29, -1):
                        text = page_texts[i]
                        if regex.search(pattern, text):
                            return i

            # No end pattern matched: pick a heuristic default end page.
            if total_pages > 200:
                print(f"未找到结束模式总页数大于200平滑调整结束页为 {max(100, int(total_pages * 1 / 2))}{pdf_path}")
                return max(100, int(total_pages * 1 / 2))
            elif 100 < total_pages <= 200:
                print("未找到结束模式总页数在100到200之间设置结束页为总页数的三分之二。 " + pdf_path)
                return max(100, int(total_pages * 2 / 3))
            else:
                print("未找到结束模式,设置结束页为最后一页。 " + pdf_path)
                return total_pages - 1

        if total_pages < 100:
            # Only one search needed — the original spun up a thread pool for
            # a single task; a direct call is equivalent and cheaper.
            start_page_found = 0
            end_page_found = find_end_page()
        else:
            # Run the two independent searches concurrently.
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future_start = executor.submit(find_start_page, begin_page)
                future_end = executor.submit(find_end_page)
                start_page_found = future_start.result()
                end_page_found = future_end.result()

        if start_page_found > end_page_found:
            print(f"无效的页码范围: 起始页 ({start_page_found + 1}) > 结束页 ({end_page_found + 1})")
            return "", 0

        output_path = save_extracted_pages(
            pdf_path=pdf_path,
            output_folder=output_folder,
            start_page=start_page_found,
            end_page=end_page_found,
            output_suffix="invalid",
            common_header=common_header
        )
        return output_path, end_page_found
    except Exception as e:
        print(f"处理文件 {pdf_path} 时发生错误: {e}")
        return "", 0
    finally:
        # Guard against double-closing: the fitz document is already closed by
        # its with-block; closing a closed Document raises.
        if isinstance(pdf_document, fitz.Document) and not pdf_document.is_closed:
            pdf_document.close()
2024-12-18 16:01:32 +08:00
2025-02-16 18:09:45 +08:00
def extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, common_header,
                          exclusion_pattern=None, output_suffix="normal", invalid_endpage=-1):
    """
    Generic page-range locator: scan pages from begin_page and return the
    (start_page, end_page) pair delimited by begin_pattern / end_pattern.

    Args:
        pdf_path: PDF to scan.
        begin_pattern / end_pattern: compiled regexes for the section bounds.
        begin_page: index before which a start match is not accepted
            (for 'notice' the start may equal begin_page).
        common_header: shared page header stripped from each page's text.
        exclusion_pattern: optional regex; its first match on a page skips
            that page (only once — see `flag` below).
        output_suffix: section tag; changes matching rules for
            'notice' / 'tobidders_notice' / 'tobidders_notice2'.
        invalid_endpage: hard upper bound for the scan (-1 = no bound).

    Returns:
        (start_page, end_page) — either may be None; (None, None) when the
        PDF cannot be read at all.
    """
    # Hoisted: constant pattern, previously recompiled on every page when
    # output_suffix == "tobidders_notice2".
    catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)

    def process_pages(get_text, total_pages):
        start_page = None
        end_page = None
        flag = True  # exclusion is applied at most once, then disabled
        end_limit = total_pages if invalid_endpage == -1 else min(invalid_endpage, total_pages)
        for i in range(begin_page, end_limit):
            text = get_text(i)
            if text:
                cleaned_text = clean_page_content(text, common_header)
                if output_suffix == "tobidders_notice2":
                    # Here exclusion only applies after the start was found.
                    if exclusion_pattern and flag and (start_page is not None) and regex.search(exclusion_pattern, cleaned_text):
                        flag = False
                        continue
                    # Skip a table-of-contents page when scanning from page 0.
                    if begin_page == 0 and catalog_pattern.search(cleaned_text):
                        continue
                else:
                    # Avoid matching front-table / body cross references
                    # before the real section start.
                    if exclusion_pattern and flag and (start_page is None) and regex.search(exclusion_pattern, cleaned_text):
                        flag = False
                        continue
                if start_page is None and regex.search(begin_pattern, cleaned_text):
                    # 'notice' may start at begin_page itself; others must
                    # start strictly after it.
                    if (output_suffix == "notice" and i >= begin_page) or (output_suffix != "notice" and i > begin_page):
                        start_page = i
                        continue
                if start_page is not None:
                    if output_suffix in ["tobidders_notice", "notice", "tobidders_notice2"]:
                        if regex.search(end_pattern, cleaned_text) and i > start_page:
                            end_page = i
                            break
                    else:
                        # The end page must not itself look like a new start.
                        if regex.search(end_pattern, cleaned_text) and not regex.search(begin_pattern, cleaned_text):
                            end_page = i
                            break
        return start_page, end_page

    # First attempt: PyPDF2 (file handle managed by the with-block).
    try:
        with open(pdf_path, "rb") as f:
            pdf_document = PdfReader(f)
            total_pages = len(pdf_document.pages)
            get_text = create_get_text_function('pypdf2', pdf_document)
            start_page, end_page = process_pages(get_text, total_pages)
            return start_page, end_page
    except Exception as e_pypdf2:
        print(f"extract_pages_generic: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
        # Fallback: PyMuPDF.
        try:
            with fitz.open(pdf_path) as pdf_document:
                total_pages = pdf_document.page_count
                get_text = create_get_text_function('fitz', pdf_document)
                start_page, end_page = process_pages(get_text, total_pages)
                return start_page, end_page
        except Exception as e_fitz:
            print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
            return None, None
2025-01-09 16:45:55 +08:00
2025-01-13 11:27:52 +08:00
def generate_mid_pattern(chapter_type=None):
    """
    Build the regex used to detect the "middle" page of the bidder-notice
    section (说明 / 总则 / 名词解释 / 定义 / 须知正文 headings).

    Args:
        chapter_type (str, optional): '章' or '部分' — the heading style that
            matched the section start; the opposite style's chapter heading is
            then also accepted as a mid-section marker.

    Returns:
        regex.Pattern: compiled MULTILINE pattern.
    """
    base_mid_pattern = (
        r'^\s*(?:[(]\s*[一二12]?\s*[)]\s*[、..]*|'
        r'[一二12][、..]+|[、..]+)\s*(说\s*明|总\s*则|名\s*词\s*解\s*释|定\s*义)|'
        r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人).*须知正文\s*$'
    )
    additional_mid_pattern = ''
    if chapter_type:
        # Restored '章' literal (lost in a corrupted copy — the comparison was
        # against '' which made this branch unreachable; the elif already
        # handles '部分', so this must be '章').
        if chapter_type == '章':
            additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:部分)'
        elif chapter_type == '部分':
            additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:章)'
    if additional_mid_pattern:
        combined_pattern = rf'(?:{base_mid_pattern})|(?:{additional_mid_pattern})'
    else:
        combined_pattern = base_mid_pattern
    return regex.compile(combined_pattern, regex.MULTILINE)
2025-01-13 11:27:52 +08:00
def generate_end_pattern(extraction_stage, chapter_type=None):
    """
    Build the end-of-section regex for the bidder-notice extraction.

    Args:
        extraction_stage (str): 'first' or 'third' pass — the third pass uses
            a stricter, anchored template with cross-reference look-behinds.
        chapter_type (str, optional): '章' or '部分'; when omitted, both
            heading styles are accepted.

    Returns:
        regex.Pattern: compiled MULTILINE pattern.
    """
    # Appendix headings terminate the section in every stage.
    appendix_alternatives = (
        r'^附录(?:一)?[:]|'
        r'^附件(?:一)?[:]|'
        r'^附表(?:一)?[:]'
    )
    templates = {
        'first': (
            r'^第[一二三四五六七八九十百千]+(?:{chapter_type})\s*[\u4e00-\u9fff]+'
            r'|' + appendix_alternatives
        ),
        'third': (
            r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)第[一二三四五六七八九十百千]+(?:{chapter_type})\s*[\u4e00-\u9fff、()/]+\s*$'
            r'|' + appendix_alternatives
        ),
    }
    chosen = templates['third'] if extraction_stage == 'third' else templates['first']
    filled = chosen.format(chapter_type=chapter_type if chapter_type else '章|部分')
    return regex.compile(filled, regex.MULTILINE)
def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_header, invalid_endpage=-1):
    """
    Extract the "instructions to bidders" section: locate its start, middle
    and end pages and save it as two PDFs (front table + main body).

    Three successively looser strategies are tried; returns (path1, path2),
    or ("", "") when all of them fail.
    """
    exclusion_pattern = regex.compile(
        r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)

    def run_extraction(begin_pattern, get_text, total_pages, extraction_stage='first'):
        """
        Scan pages with begin_pattern; end/mid patterns are generated
        dynamically once the start page's heading style ('章'/'部分') is known.

        Args:
            begin_pattern (regex.Pattern): start-of-section pattern.
            get_text: page-index -> raw text callable.
            total_pages (int): number of pages in the document.
            extraction_stage (str): 'first' or 'third' — selects the end
                pattern template (see generate_end_pattern).

        Returns:
            tuple: (start_page, mid_page, end_page), each possibly None.
        """
        start_page = None
        mid_page = None
        end_page = None
        combined_mid_pattern = None  # built after the start page is found
        catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
        end_limit = total_pages if invalid_endpage == -1 else min(invalid_endpage, total_pages)

        for i in range(end_limit):
            try:
                text = get_text(i)
                cleaned_text = clean_page_content(text, common_header)
            except Exception as e:
                print(f"提取第 {i} 页文本时出错: {e}")
                continue
            # After the middle page is found, skip pages matching the
            # exclusion pattern (cross references to document composition).
            if exclusion_pattern and mid_page is not None and regex.search(exclusion_pattern, cleaned_text):
                continue
            # Skip table-of-contents pages when scanning from page 0.
            if begin_page == 0 and catalog_pattern.search(cleaned_text):
                continue
            # ---- start page ----
            if start_page is None:
                match = regex.search(begin_pattern, cleaned_text)
                if match and i > begin_page:
                    start_page = i
                    matched_text = match.group(0)
                    # Restored '章' literals (lost in a corrupted copy; the
                    # elif already handles '部分').
                    if '章' in matched_text:
                        chapter_type = '章'
                    elif '部分' in matched_text:
                        chapter_type = '部分'
                    else:
                        chapter_type = None
                    # Derive the stage/heading-specific end and mid patterns.
                    end_pattern_dynamic = generate_end_pattern(extraction_stage, chapter_type)
                    combined_mid_pattern = generate_mid_pattern(chapter_type)
                continue
            # ---- middle page ----
            if start_page is not None and mid_page is None and combined_mid_pattern:
                if regex.search(combined_mid_pattern, cleaned_text):
                    mid_page = i
            # ---- end page ----
            if start_page is not None and mid_page is not None:
                if regex.search(end_pattern_dynamic, cleaned_text):
                    if i > mid_page:
                        end_page = i
                        break
        return start_page, mid_page, end_page

    def perform_extraction_and_save(start_page, mid_page, end_page):
        # Part 1: start..mid; part 2: mid..end (same file when mid == end).
        path1 = save_extracted_pages(pdf_path, output_folder, start_page, mid_page, "tobidders_notice_part1",
                                     common_header)
        if mid_page != end_page:
            path2 = save_extracted_pages(pdf_path, output_folder, mid_page, end_page, "tobidders_notice_part2",
                                         common_header)
        else:
            path2 = path1
        return path1, path2

    def process_extraction(get_text, total_pages):
        # Pass 1: strict begin pattern.
        begin_pattern = regex.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+|'
            r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表\s*$|'
            r'(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+\s*$',
            regex.MULTILINE
        )
        start_page, mid_page, end_page = run_extraction(begin_pattern, get_text, total_pages, extraction_stage='first')
        if start_page is not None and mid_page is not None and end_page is not None:
            return perform_extraction_and_save(start_page, mid_page, end_page)
        print(f"第一次提取tobidders_notice失败尝试第二次提取: {pdf_path}")

        # Pass 2: two-range fallback extraction.
        start_page1, end_page1, start_page2, end_page2 = extract_pages_twice_tobidders_notice(
            pdf_path, common_header, begin_page
        )
        if (start_page1 is not None and end_page1 is not None and
                start_page2 is not None and end_page2 is not None):
            if end_page1 == start_page2:
                mid_page = end_page1
                return perform_extraction_and_save(start_page1, mid_page, end_page2)
            else:
                path1 = save_extracted_pages(
                    pdf_path, output_folder, start_page1, end_page1,
                    "tobidders_notice_part1", common_header
                )
                path2 = save_extracted_pages(
                    pdf_path, output_folder, start_page2, end_page2,
                    "tobidders_notice_part2", common_header
                )
                return path1, path2
        print("第二次提取tobidders_notice失败尝试第三次提取!")

        # Pass 3: loosest begin pattern.
        new_begin_pattern = regex.compile(
            r'.*(?:投标人|磋商|谈判|供应商|应答人|比选申请人).*须知\s*$|'
            r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表\s*$',
            regex.MULTILINE
        )
        start_page, mid_page, end_page = run_extraction(new_begin_pattern, get_text, total_pages, extraction_stage='third')
        if start_page is not None and mid_page is not None and end_page is not None:
            return perform_extraction_and_save(start_page, mid_page, end_page)
        else:
            print(f"三次提取tobidders_notice均失败!! {pdf_path}")
            return "", ""

    try:
        try:
            # PyPDF2 first.
            with open(pdf_path, "rb") as f:
                pdf_document = PdfReader(f)
                get_text = create_get_text_function('pypdf2', pdf_document)
                total_pages = len(pdf_document.pages)
                return process_extraction(get_text, total_pages)
        except Exception as e_pypdf2:
            print(f"extract_pages_tobidders_notice: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
            # PyMuPDF fallback.
            with fitz.open(pdf_path) as pdf_document:
                get_text = create_get_text_function('fitz', pdf_document)
                total_pages = pdf_document.page_count
                return process_extraction(get_text, total_pages)
    except Exception as e:
        print(f"处理文件 {pdf_path} 时发生错误: {e}")
        return "", ""
2025-02-14 15:46:07 +08:00
2025-01-09 16:45:55 +08:00
def extract_pages_twice_tobidders_notice(pdf_path, common_header, begin_page):
    """
    Fallback extraction that splits the bidder-notice section into two page
    ranges (front table, then main body).

    Returns:
        (start1, end1, start2, end2) page indices; (None, None, None, None)
        when the first range cannot be located or the PDF cannot be read.
        When the second range is missing, it collapses onto end1.
    """
    try:
        # Keep the file handle scoped so it is closed after reading.
        with open(pdf_path, "rb") as f:
            reader = PdfReader(f)

            # Part-1 start heading; a generic chapter heading doubles as the
            # part-1 end and the part-2 start.
            begin_pattern = regex.compile(
                r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知)+',
                regex.MULTILINE
            )
            second_begin_pattern = regex.compile(
                r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*([\u4e00-\u9fff]+)',
                regex.MULTILINE
            )
            end_pattern = second_begin_pattern
            exclusion_pattern = regex.compile(
                r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$',
                regex.MULTILINE
            )
            exclusion_words = ["合同", "评标", "开标", "评审", "采购", "资格"]
            output_suffix = "tobidders_notice"

            # Range 1.
            start_page1, end_page1 = extract_pages_generic(
                pdf_path, begin_pattern, end_pattern, begin_page, common_header,
                exclusion_pattern, output_suffix
            )
            if start_page1 is None or end_page1 is None:
                return None, None, None, None

            # Inspect the page where range 1 ended: if its chapter title is a
            # contract/evaluation/etc. chapter, range 2 must restart from the
            # same strict begin pattern instead.
            boundary_text = get_text_pypdf2(reader, end_page1)
            cleaned_boundary = clean_page_content(boundary_text, common_header)
            heading_match = end_pattern.search(cleaned_boundary)
            if heading_match:
                chapter_title = heading_match.group(1) if heading_match.groups() else ""
                if any(word in chapter_title for word in exclusion_words):
                    second_begin_pattern = begin_pattern
                    output_suffix = "tobidders_notice2"

            # Range 2: begin one page earlier so start_page > begin_page holds.
            begin_page2 = end_page1 - 1
            start_page2, end_page2 = extract_pages_generic(
                pdf_path, second_begin_pattern, end_pattern, begin_page2, common_header,
                exclusion_pattern, output_suffix
            )
            if start_page2 is None or end_page2 is None:
                return start_page1, end_page1, end_page1, end_page1
            return start_page1, end_page1, start_page2, end_page2
    except Exception as e:
        print(f"读取 PDF 文件失败: {e}")
        return None, None, None, None
def get_notice(pdf_path, output_folder, begin_page, common_header, invalid_endpage):
    """
    Locate and save the announcement ("notice") section of a tender PDF.

    Returns:
        the saved file path, or "" when the page range cannot be determined
        (save_extracted_pages handles the None range and returns "").
    """
    begin_pattern = regex.compile(
        r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)]?\s*$',
        regex.MULTILINE
    )
    end_pattern = regex.compile(
        r'^第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|'  # chapter heading
        r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'  # reject cross references
        r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()]+\s*$|'  # anchored chapter heading
        r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
        r'(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+\s*$',  # bidder-notice heading
        regex.MULTILINE
    )
    exclusion_pattern = regex.compile(
        r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)

    suffix = 'notice'
    start_page, end_page = extract_pages_generic(
        pdf_path, begin_pattern, end_pattern, begin_page, common_header,
        exclusion_pattern, suffix, invalid_endpage
    )
    if start_page is None or end_page is None:
        print(f"{suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
    return save_extracted_pages(pdf_path, output_folder, start_page, end_page, suffix, common_header)
2024-12-18 16:01:32 +08:00
if __name__ == "__main__":
    # Ad-hoc manual test entry: hard-coded Windows paths; the sample call is
    # kept commented out for reference.
    file_path = r'D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\ztbfile.pdf'
    output_folder = r'D:\flask_project\flask_app\static\output\output1\2c4be864-bdab-405d-95cb-9d945d8627b3\tmp'
    # res=get_invalid_file(file_path,output_folder,"",0)