2024-12-17 18:44:58 +08:00
|
|
|
|
#flask_app/general/截取pdf通用函数.py
|
|
|
|
|
import concurrent.futures
|
|
|
|
|
import os
|
|
|
|
|
|
|
|
|
|
import regex
|
2024-12-18 10:32:24 +08:00
|
|
|
|
from PyPDF2 import PdfReader, PdfWriter
|
2024-12-17 18:44:58 +08:00
|
|
|
|
|
|
|
|
|
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
|
2024-12-18 10:32:24 +08:00
|
|
|
|
from flask_app.general.format_change import docx2pdf
|
2024-12-18 16:01:32 +08:00
|
|
|
|
import concurrent.futures
|
2024-12-18 10:32:24 +08:00
|
|
|
|
|
2024-12-17 18:44:58 +08:00
|
|
|
|
|
|
|
|
|
def get_start_and_common_header(input_path, end_page):
    """
    Locate the first page that looks like the start of the tender notice.

    Pages are scanned in order starting from page 0. A page is ignored when it
    has no extractable text or when its cleaned text contains a line ending in
    "目录" (table of contents). The first remaining page whose cleaned text
    contains a line ending in one of the notice/invitation keywords wins.

    Args:
        input_path: Path of the PDF to scan.
        end_page: Last 0-based page index that may be inspected.

    Returns:
        tuple: (common_header, page_index). page_index is the 0-based index of
        the matching page, or 0 when nothing matched up to end_page.
    """
    common_header = extract_common_header(input_path)

    # A line that ends with a notice/invitation keyword (optionally followed by
    # a closing parenthesis). The look-behinds reject keywords that directly
    # follow "见", "与" or a quotation mark.
    begin_pattern = regex.compile(
        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
        regex.MULTILINE
    )
    # A line ending in "目录" marks a table-of-contents page.
    catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)

    reader = PdfReader(input_path)
    for page_index, page in enumerate(reader.pages):
        if page_index > end_page:
            # Search window exhausted without a hit.
            break

        raw_text = page.extract_text()
        if not raw_text:
            continue

        page_body = clean_page_content(raw_text, common_header)
        if catalog_pattern.search(page_body):
            # Table-of-contents pages list the keywords too; never treat them
            # as the start of the notice.
            continue
        if begin_pattern.search(page_body):
            return common_header, page_index  # page indices are 0-based

    return common_header, 0
|
|
|
|
|
|
2024-12-18 16:01:32 +08:00
|
|
|
|
def check_pdf_pages(pdf_path, mode, logger):
    """
    Decide whether a PDF is small enough to skip the splitting logic.

    Args:
        pdf_path: Path of the PDF to inspect.
        mode: 'goods' selects the 8-slot placeholder list; any other value
            selects the 7-slot one.
        logger: Object exposing info() and error().

    Returns:
        tuple: (skip, paths).
        * (False, []) when the PDF has more than 30 pages, meaning the caller
          should carry on with splitting.
        * (True, placeholder) when the PDF has 30 pages or fewer, or when the
          page count could not be read. The placeholder is a list of empty
          strings with pdf_path in the second-to-last slot.
    """
    # NOTE(review): the second-to-last slot presumably is the "invalid_path"
    # entry mentioned in the log message below - confirm against callers.
    if mode == 'goods':
        passthrough = ['', '', '', '', '', '', pdf_path, '']
    else:
        passthrough = ['', '', '', '', '', pdf_path, '']

    try:
        page_count = len(PdfReader(pdf_path).pages)
        logger.info(f"PDF '{pdf_path}' 的总页数为: {page_count}")

        if page_count > 30:
            # Large document: signal the caller to continue processing.
            return False, []

        logger.info("PDF(invalid_path)页数小于或等于30页,跳过切分逻辑。")
        return True, passthrough
    except Exception as e:
        # The page count is unreadable; fall back to the same placeholder so
        # the caller skips splitting instead of failing.
        logger.error(f"无法读取 PDF 页数: {e}")
        return True, passthrough
|
2024-12-18 10:32:24 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header):
    """
    Copy pages start_page..end_page (0-based, inclusive) of a PDF into a new file.

    The result is written to "<output_folder>/<pdf base name>_<output_suffix>.pdf".
    When output_suffix is 'notice' and start_page > 0, the leading pages are
    additionally written to "<pdf base name>_before.pdf": everything up to and
    including the first page containing "目录" (table of contents) if one is
    found before start_page, otherwise every page before start_page.

    Args:
        pdf_path: Path of the source PDF.
        output_folder: Folder that receives the generated file(s).
        start_page: First 0-based page index to copy.
        end_page: Last 0-based page index to copy (inclusive).
        output_suffix: Suffix appended to the base file name.
        common_header: Header text passed to clean_page_content when scanning
            for the table of contents.

    Returns:
        str: Path of the written PDF, or "" when the page range is missing or
        invalid, or when any error occurs (errors are printed, not raised).
    """
    try:
        if start_page is None or end_page is None:
            print("Error: start_page 或 end_page 为 None")
            return ""

        pdf_document = PdfReader(pdf_path)
        total_pages = len(pdf_document.pages)
        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")

        if start_page < 0 or end_page >= total_pages or start_page > end_page:
            print(f"无效的页面范围: {start_page} 到 {end_page}")
            return ""

        if output_suffix == 'notice' and start_page > 0:
            before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
            before_doc = PdfWriter()
            toc_page = -1

            # Look for the table-of-contents page among the pages preceding
            # the notice. start_page < total_pages is guaranteed by the range
            # check above, so range(start_page) never runs past the document.
            for page_num in range(start_page):
                page_text = pdf_document.pages[page_num].extract_text()
                if not page_text:
                    # A page without extractable text cannot contain the
                    # table of contents; skip it rather than hand an empty
                    # value to clean_page_content.
                    continue
                cleaned_text = clean_page_content(page_text, common_header)
                if regex.search(r'目\s*录', cleaned_text, regex.MULTILINE):
                    toc_page = page_num
                    break

            # Keep pages up to and including the table of contents when it
            # exists, otherwise everything before the notice.
            pages_to_extract = toc_page + 1 if toc_page != -1 else start_page
            for page_num in range(pages_to_extract):
                before_doc.add_page(pdf_document.pages[page_num])
            print(before_pdf_path)
            with open(before_pdf_path, 'wb') as f_before:
                before_doc.write(f_before)

        output_doc = PdfWriter()
        for page_num in range(start_page, end_page + 1):
            output_doc.add_page(pdf_document.pages[page_num])
        with open(output_pdf_path, 'wb') as f:
            output_doc.write(f)

        print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
        return output_pdf_path
    except Exception as e:
        # Best-effort contract: callers receive "" instead of an exception.
        print(f"Error in save_extracted_pages: {e}")
        return ""
|
|
|
|
|
|
|
|
|
|
def is_pdf_or_doc(filename):
    """Return True when filename ends with .pdf, .doc or .docx, ignoring case."""
    accepted_suffixes = ('.pdf', '.doc', '.docx')
    lowered = filename.lower()
    return any(lowered.endswith(suffix) for suffix in accepted_suffixes)
|
|
|
|
|
|
2024-12-17 18:44:58 +08:00
|
|
|
|
|
2024-12-18 10:32:24 +08:00
|
|
|
|
def convert_to_pdf(file_path):
    """
    Return a PDF path for file_path, converting Word documents on the way.

    Only the file extension is inspected. Paths ending in .doc or .docx
    (case-insensitive) are handed to docx2pdf and its result is returned
    unchanged; every other path is returned as-is.
    """
    is_word_document = file_path.lower().endswith(('.doc', '.docx'))
    if not is_word_document:
        return file_path
    # NOTE(review): docx2pdf presumably returns the path of the converted
    # PDF - confirm in flask_app.general.format_change.
    return docx2pdf(file_path)
|
2024-12-17 18:44:58 +08:00
|
|
|
|
|
2024-12-18 16:01:32 +08:00
|
|
|
|
|
|
|
|
|
def get_invalid_file(file_path, output_folder, common_header, begin_page=0):
    """
    Extract the "invalid" section of a tender PDF and save it as a new file.

    The section runs from a start page (found with the begin patterns) to an
    end page (found with the end patterns):

    * PDFs with 50 pages or fewer are not cut at all.
    * With fewer than 100 pages the start page is always 0; otherwise the
      begin patterns are tried over the 30 pages starting at begin_page,
      skipping pages that match the exclusion pattern.
    * The end patterns never look at the first 30 pages. With more than 200
      pages they are searched front to back, otherwise back to front.
    * When no end pattern matches, the end page falls back to
      max(100, 2/3 of the page count) for PDFs over 100 pages, and to the
      last page otherwise.

    Args:
        file_path (str): Path of the input PDF.
        output_folder (str): Folder that receives the extracted file.
        common_header (str): Common header text used to clean each page.
        begin_page (int): 0-based page index at which the start-page search
            begins. Defaults to 0; negative values are treated as 0.

    Returns:
        str: file_path itself when the PDF has 50 pages or fewer; otherwise
        the value returned by save_extracted_pages for the "invalid" suffix.
        "" is returned when the computed range is invalid or any error occurs.
    """
    # Start patterns, tried in order of priority.
    begin_patterns = [
        regex.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请',
            regex.MULTILINE
        ),
        regex.compile(
            r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
            regex.MULTILINE
        )
    ]

    # End patterns, tried in order of priority.
    end_patterns = [
        regex.compile(
            r'第[一二三四五六七八九十]+(?:章|部分).*?合同|[::]清标报告',
            regex.MULTILINE
        ),
        regex.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*',
            regex.MULTILINE
        ),
        regex.compile(
            r"\s*(投标文件|响应文件|响应性文件)(?:的)?格式\s*",
            regex.MULTILINE
        )
    ]

    # Pages containing any of these phrases are never accepted as a start page.
    exclusion_pattern = regex.compile(
        r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成',
        regex.MULTILINE
    )

    try:
        pdf_document = PdfReader(file_path)
        total_pages = len(pdf_document.pages)

        # Short documents are returned untouched.
        if total_pages <= 50:
            print(f"PDF页数({total_pages}) <= 50,直接返回文件路径:{file_path}")
            return file_path

        # Extract and clean every page once so both searches share the result.
        page_texts = []
        for page in pdf_document.pages:
            text = page.extract_text()
            page_texts.append(clean_page_content(text, common_header) if text else "")

        def find_start_page(first_page):
            """Return the first page matching a begin pattern, or 0 if none does."""
            # A negative index would silently address pages from the end.
            first_page = max(first_page, 0)
            last_candidate = min(first_page + 30, total_pages)
            for pattern in begin_patterns:
                for i in range(first_page, last_candidate):
                    text = page_texts[i]
                    if exclusion_pattern.search(text):
                        continue
                    if pattern.search(text):
                        return i
            return 0  # default: start from the first page

        def find_end_page():
            """Return the page matching an end pattern, or a size-based fallback."""
            if total_pages > 200:
                # Very long documents: first match after the first 30 pages.
                candidates = range(30, total_pages)
            else:
                # Otherwise: last match, stopping before the first 30 pages.
                candidates = range(total_pages - 1, 29, -1)

            for pattern in end_patterns:
                for i in candidates:
                    if pattern.search(page_texts[i]):
                        return i

            # Nothing matched: fall back to a size-based end page.
            if total_pages > 100:
                print("未找到结束模式,总页数大于100,设置结束页为前100页。"+file_path)
                return max(100, int(total_pages * 2 / 3))
            print("未找到结束模式,设置结束页为最后一页。"+file_path)
            return total_pages - 1

        if total_pages < 100:
            # Only one search is needed here, so it is called directly.
            start_page = 0
            end_page = find_end_page()
        else:
            # The two searches are independent; submit both before waiting.
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future_start = executor.submit(find_start_page, begin_page)
                future_end = executor.submit(find_end_page)
                start_page = future_start.result()
                end_page = future_end.result()

        if start_page > end_page:
            print(f"无效的页码范围: 起始页 ({start_page + 1}) > 结束页 ({end_page + 1})")
            return ""

        return save_extracted_pages(
            pdf_path=file_path,
            output_folder=output_folder,
            start_page=start_page,
            end_page=end_page,
            output_suffix="invalid",
            common_header=common_header
        )

    except Exception as e:
        # Best-effort contract: callers receive "" instead of an exception.
        print(f"处理文件 {file_path} 时发生错误: {e}")
        return ""
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Ad-hoc manual run against a local sample document.
    file_path = r'C:\Users\Administrator\Desktop\new招标文件\货物标\HBDL-2024-0310-001-招标文件.pdf'
    output_folder = r'C:\Users\Administrator\Desktop\new招标文件\货物标\tmp'
    # get_invalid_file takes begin_page as its fourth argument; 0 starts the
    # start-page search at the first page.
    res = get_invalid_file(file_path, output_folder, "", 0)
|
2024-12-17 18:44:58 +08:00
|
|
|
|
|