zbparse/flask_app/general/截取pdf通用函数.py

266 lines
11 KiB
Python
Raw Normal View History

2024-12-17 18:44:58 +08:00
#flask_app/general/截取pdf通用函数.py
import concurrent.futures
import os
import regex
2024-12-18 10:32:24 +08:00
from PyPDF2 import PdfReader, PdfWriter
2024-12-17 18:44:58 +08:00
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
2024-12-18 10:32:24 +08:00
from flask_app.general.format_change import docx2pdf
2024-12-18 16:01:32 +08:00
import concurrent.futures
2024-12-18 10:32:24 +08:00
2024-12-17 18:44:58 +08:00
def get_start_and_common_header(input_path, end_page):
    """
    Find the first page that looks like the tender notice / invitation heading.

    Pages are scanned from the start of the PDF. Pages with no extractable
    text are ignored, and a page whose cleaned text contains a line ending
    in the table-of-contents heading is skipped even if it also mentions
    the notice heading.

    Args:
        input_path: Path of the PDF to scan.
        end_page: Last 0-based page index that may be inspected.

    Returns:
        tuple: (common_header, page_index). page_index is the 0-based index
        of the first matching page, or 0 when nothing matched within
        pages 0..end_page. Note that 0 is therefore ambiguous between
        "matched on the first page" and "no match".
    """
    # Heading of the notice / invitation section. The variable-length
    # look-behinds require the third-party `regex` module.
    notice_heading = regex.compile(
        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
        regex.MULTILINE
    )
    # Table-of-contents heading at the end of a line.
    toc_heading = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)

    common_header = extract_common_header(input_path)
    reader = PdfReader(input_path)
    for page_index, page in enumerate(reader.pages):
        if page_index > end_page:
            # Search window exhausted without a match.
            return common_header, 0
        raw_text = page.extract_text()
        if not raw_text:
            continue
        page_body = clean_page_content(raw_text, common_header)
        is_toc_page = toc_heading.search(page_body) is not None
        if not is_toc_page and notice_heading.search(page_body):
            # First match wins.
            return common_header, page_index
    return common_header, 0
2024-12-18 16:01:32 +08:00
def check_pdf_pages(pdf_path, mode, logger):
    """
    Decide whether a PDF is short enough to skip the splitting logic.

    Args:
        pdf_path: Path of the PDF to inspect.
        mode: 'goods' selects the 8-slot result layout; any other value
            selects the 7-slot layout.
        logger: Object exposing ``info`` and ``error`` methods.

    Returns:
        tuple: (skip, paths).
        * (True, paths) when the PDF has at most 30 pages, or when its page
          count cannot be read. ``paths`` is a list of empty strings with
          ``pdf_path`` at index 6 (mode 'goods', 8 slots) or index 5
          (otherwise, 7 slots).
        * (False, []) when the PDF has more than 30 pages and the caller
          should go on with splitting.
    """
    # Single source of truth for the threshold; the log message below used
    # to say 50 while the code compared against 30.
    max_pages_without_split = 30

    def passthrough_result():
        # The whole, unsplit PDF is placed in the second-to-last slot.
        if mode == 'goods':
            return ['', '', '', '', '', '', pdf_path, '']
        return ['', '', '', '', '', pdf_path, '']

    try:
        reader = PdfReader(pdf_path)
        num_pages = len(reader.pages)
        logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}")
        if num_pages <= max_pages_without_split:
            logger.info(f"PDF页数小于或等于{max_pages_without_split}页跳过切分逻辑。")
            return True, passthrough_result()
        # More pages than the threshold: tell the caller to keep processing.
        return False, []
    except Exception as e:
        # Best effort: an unreadable PDF is handed back whole instead of
        # aborting the pipeline.
        logger.error(f"无法读取 PDF 页数: {e}")
        return True, passthrough_result()
2024-12-18 10:32:24 +08:00
def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header):
    """
    Copy pages start_page..end_page (0-based, inclusive) into a new PDF.

    The output is written to ``<output_folder>/<base>_<output_suffix>.pdf``
    where ``<base>`` is the input file name without its extension.

    When ``output_suffix`` is 'notice' and ``start_page`` is greater than 0,
    the pages preceding the notice are additionally written to
    ``<output_folder>/<base>_before.pdf``.

    Args:
        pdf_path: Path of the source PDF.
        output_folder: Folder that receives the output file(s).
        start_page: First page index to copy, or None.
        end_page: Last page index to copy, or None.
        output_suffix: Suffix used in the output file name.
        common_header: Header text passed to clean_page_content when
            looking for the table-of-contents page.

    Returns:
        str: Path of the written PDF, or "" when the range is missing or
        invalid or when any error occurs (errors are printed, not raised).
    """
    try:
        if start_page is None or end_page is None:
            print("Error: start_page 或 end_page 为 None")
            return ""
        pdf_document = PdfReader(pdf_path)
        total_pages = len(pdf_document.pages)
        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
        if start_page < 0 or end_page >= total_pages or start_page > end_page:
            # The two numbers used to be printed back to back with no separator.
            print(f"无效的页面范围: {start_page} 到 {end_page}")
            return ""
        if output_suffix == 'notice' and start_page > 0:
            before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
            _save_pages_before_notice(pdf_document, before_pdf_path, start_page, common_header)
        output_doc = PdfWriter()
        for page_num in range(start_page, end_page + 1):
            output_doc.add_page(pdf_document.pages[page_num])
        with open(output_pdf_path, 'wb') as f:
            output_doc.write(f)
        print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
        return output_pdf_path
    except Exception as e:
        print(f"Error in save_extracted_pages: {e}")
        return ""


def _save_pages_before_notice(pdf_document, before_pdf_path, start_page, common_header):
    """Write the pages that precede the notice section to before_pdf_path.

    If a table-of-contents page is found before start_page, pages up to and
    including that page are written; otherwise pages 0..start_page-1 are.
    """
    pages_to_extract = start_page
    # start_page is already validated to be inside the document.
    for page_num in range(start_page):
        page_text = pdf_document.pages[page_num].extract_text()
        # Guard against pages without extractable text.
        cleaned_text = clean_page_content(page_text, common_header) if page_text else ""
        # Require the full "目录" heading; matching "录" alone flagged any
        # page that merely contained that character.
        if regex.search(r'目\s*录', cleaned_text):
            pages_to_extract = page_num + 1
            break
    before_doc = PdfWriter()
    for page_num in range(pages_to_extract):
        before_doc.add_page(pdf_document.pages[page_num])
    print(before_pdf_path)
    with open(before_pdf_path, 'wb') as f_before:
        before_doc.write(f_before)
def is_pdf_or_doc(filename):
    """Return True when filename ends with .pdf, .doc or .docx (case-insensitive)."""
    supported_suffixes = ('.pdf', '.doc', '.docx')
    lowered = filename.lower()
    return any(lowered.endswith(suffix) for suffix in supported_suffixes)
2024-12-17 18:44:58 +08:00
2024-12-18 10:32:24 +08:00
def convert_to_pdf(file_path):
    """
    Convert a Word document to PDF; pass any other path through untouched.

    The decision is based on the file extension only (.doc / .docx,
    case-insensitive). For Word documents the result of docx2pdf(file_path)
    is returned — presumably the path of the converted PDF; confirm against
    flask_app.general.format_change.
    """
    is_word_document = file_path.lower().endswith(('.doc', '.docx'))
    if not is_word_document:
        return file_path
    return docx2pdf(file_path)
2024-12-17 18:44:58 +08:00
2024-12-18 16:01:32 +08:00
def get_invalid_file(file_path, output_folder, common_header, begin_page=0):
    """
    Extract a page range from a PDF and save it with the "invalid" suffix.

    The start page is searched forwards with the begin patterns; the end
    page is searched with the end patterns.

    * Fewer than 100 pages: no start search, the range starts at page 0.
    * Otherwise the start is searched in the 30 pages from ``begin_page``.
    * More than 200 pages: the end is searched forwards from page index 30.
    * Otherwise the end is searched backwards, never entering the first
      30 pages.

    Args:
        file_path (str): Path of the input PDF.
        output_folder (str): Folder that receives the extracted file.
        common_header (str): Common header text used to clean each page.
        begin_page (int): 0-based page index where the start search begins.
            Defaults to 0.

    Returns:
        str: Path returned by save_extracted_pages, or "" when the computed
        range is invalid or any error occurs (errors are printed, not raised).
    """
    # Patterns that mark the start of the range, in priority order.
    begin_patterns = [
        regex.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请',
            regex.MULTILINE
        ),
        regex.compile(
            r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
            regex.MULTILINE
        )
    ]
    # Patterns that mark the end of the range, in priority order.
    end_patterns = [
        regex.compile(
            r'第[一二三四五六七八九十]+(?:章|部分).*?合同|[:]清标报告',
            regex.MULTILINE
        ),
        regex.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*',
            regex.MULTILINE
        ),
        regex.compile(
            r"\s*(投标文件|响应文件|响应性文件)(?:的)?格式\s*",
            regex.MULTILINE
        )
    ]
    # Pages matching this are never accepted as the start page.
    exclusion_pattern = regex.compile(
        r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成',
        regex.MULTILINE
    )
    try:
        pdf_document = PdfReader(file_path)
        # Extract and clean every page once; both searches reuse the result.
        page_texts = []
        for page in pdf_document.pages:
            text = page.extract_text()
            page_texts.append(clean_page_content(text, common_header) if text else "")
        total_pages = len(page_texts)

        # Both searches are in-memory regex scans, so they run sequentially;
        # the previous thread pool was awaited immediately and added nothing.
        if total_pages < 100:
            start_page = 0
        else:
            start_page = _find_invalid_start_page(
                page_texts, begin_patterns, exclusion_pattern, begin_page
            )
        end_page = _find_invalid_end_page(page_texts, end_patterns, file_path)

        if start_page > end_page:
            print(f"无效的页码范围: 起始页 ({start_page + 1}) > 结束页 ({end_page + 1})")
            # Same failure value as every other path. The former [""] was a
            # truthy list while all other returns are strings.
            return ""
        output_path = save_extracted_pages(
            pdf_path=file_path,
            output_folder=output_folder,
            start_page=start_page,
            end_page=end_page,
            output_suffix="invalid",
            common_header=common_header
        )
        return output_path
    except Exception as e:
        print(f"处理文件 {file_path} 时发生错误: {e}")
        return ""


def _find_invalid_start_page(page_texts, begin_patterns, exclusion_pattern, begin_page):
    """Return the first page in the 30-page window from begin_page that matches a begin pattern, else 0.

    Patterns are tried in order, each over the whole window, so an earlier
    pattern wins even if a later one matches on an earlier page.
    """
    window_end = min(begin_page + 30, len(page_texts))
    for pattern in begin_patterns:
        for i in range(begin_page, window_end):
            text = page_texts[i]
            if exclusion_pattern.search(text):
                continue
            if pattern.search(text):
                return i
    return 0  # default: start from the first page


def _find_invalid_end_page(page_texts, end_patterns, file_path):
    """Return the page index matching an end pattern, or a size-based fallback.

    Patterns are tried in order, each over the whole candidate range.
    """
    total_pages = len(page_texts)
    if total_pages > 200:
        # Long documents: scan forwards, skipping the first 30 pages.
        candidates = range(30, total_pages)
    else:
        # Scan backwards and stop before entering the first 30 pages.
        candidates = range(total_pages - 1, 29, -1)
    for pattern in end_patterns:
        for i in candidates:
            if pattern.search(page_texts[i]):
                return i
    # Nothing matched: fall back according to document size.
    if total_pages > 100:
        print("未找到结束模式总页数大于100设置结束页为前100页。"+file_path)
        return max(100, int(total_pages * 2 / 3))
    print("未找到结束模式,设置结束页为最后一页。"+file_path)
    return total_pages - 1
if __name__ == "__main__":
    # Manual smoke run against a local sample file.
    file_path=r'C:\Users\Administrator\Desktop\new招标文件\货物标\HBDL-2024-0310-001-招标文件.pdf'
    output_folder=r'C:\Users\Administrator\Desktop\new招标文件\货物标\tmp'
    # get_invalid_file takes (file_path, output_folder, common_header, begin_page);
    # the start-page search begins at the first page here.
    res=get_invalid_file(file_path,output_folder,"",0)
2024-12-17 18:44:58 +08:00