zbparse/flask_app/testdir/截取文件格式.py

220 lines
8.1 KiB
Python
Raw Normal View History

2024-10-24 15:52:35 +08:00
from PyPDF2 import PdfReader, PdfWriter
import re # 导入正则表达式库
import os # 用于文件和文件夹操作
2024-10-24 16:55:39 +08:00
# from flask_app.general.format_change import docx2pdf # 确保此模块可用
2024-10-24 15:52:35 +08:00
def clean_page_content(text, common_header):
    """
    Strip the shared page header and page-number artifacts from page text.

    Args:
        text: Raw text extracted from a single PDF page.
        common_header: Newline-separated header lines shared between pages.
            A falsy value skips header removal entirely.

    Returns:
        The text with matching header lines and page-number patterns removed.
    """
    header_lines = (
        [line.strip() for line in common_header.split('\n')]
        if common_header
        else []
    )
    for header in header_lines:
        if not header:
            continue
        # No MULTILINE flag here: a header line is removed only when it sits
        # at the very start of the remaining text, and only once.
        header_at_start = r'^' + re.escape(header) + r'\n?'
        text = re.sub(header_at_start, '', text, count=1)

    # A single alternation covers four shapes: a number opening a line, a
    # number closing a line, "/ N" fragments, and dash-wrapped numbers.
    page_number_pattern = (
        r'(^\s*\d+\s*(?=\D))|(\s+\d+\s*$)|(\s*/\s*\d+\s*)|(\s*[—-]\s*\d+\s*[—-]\s*)'
    )
    return re.sub(page_number_pattern, '', text, flags=re.MULTILINE)
def extract_common_header(pdf_path):
    """
    Extract the header lines shared by a small sample of pages of a PDF.

    For a two-page document both pages are compared; otherwise up to three
    consecutive pages starting at ``total_pages // 2 - 1`` are sampled. Only
    the first three non-empty lines of each sampled page are considered.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        The lines present on every sampled page, joined with newlines in the
        order they appear on the first sampled page. An empty string is
        returned when fewer than two pages yielded text.
    """
    pdf_document = PdfReader(pdf_path)
    total_pages = len(pdf_document.pages)

    # Decide how many pages to read and where to start.
    if total_pages == 2:
        pages_to_read, start_page = 2, 0
    else:
        pages_to_read = 3
        start_page = max(0, (total_pages // 2) - 1)
    stop_page = min(start_page + pages_to_read, total_pages)

    headers = []
    for index in range(start_page, stop_page):
        text = pdf_document.pages[index].extract_text() or ""
        if not text:
            continue
        # Only the first three lines of each page are header candidates.
        first_lines = text.strip().split('\n')[:3]
        headers.append([line.strip() for line in first_lines if line.strip()])

    if len(headers) < 2:
        return ""  # Not enough pages to compare.

    shared = set(headers[0]).intersection(*headers[1:])
    # Keep document order (and drop duplicates): clean_page_content removes
    # header lines one at a time from the start of the text, so an arbitrary
    # set iteration order would leave some header lines in place.
    ordered = [line for line in dict.fromkeys(headers[0]) if line in shared]
    return '\n'.join(ordered)
2024-10-24 16:55:39 +08:00
2024-10-24 15:52:35 +08:00
def is_pdf_or_doc(filename):
    """
    Tell whether a file name ends with a PDF or Word extension.

    The comparison is case-insensitive and accepts ``.pdf``, ``.doc`` and
    ``.docx``.
    """
    lowered = filename.lower()
    return any(lowered.endswith(ext) for ext in ('.pdf', '.doc', '.docx'))
2024-10-24 16:55:39 +08:00
# def convert_to_pdf(file_path):
# """
# 如果是Word文档则转换为PDF。
# """
# if file_path.lower().endswith(('.doc', '.docx')):
# return docx2pdf(file_path)
# return file_path
2024-10-24 15:52:35 +08:00
def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):
    """
    Write an inclusive page range of an opened PDF to a new PDF file.

    The output file is named ``<source base name>_<output_suffix>.pdf`` and
    is placed in ``output_folder``.

    Args:
        pdf_document: Opened reader object exposing a ``pages`` sequence.
        start_page: Zero-based index of the first page to copy.
        end_page: Zero-based index of the last page to copy (inclusive).
        pdf_path: Path of the source PDF; only its base name is used.
        output_folder: Directory that receives the new file.
        output_suffix: Suffix appended to the base name of the output file.

    Returns:
        The path of the written file, or an empty string when the range is
        invalid or any error occurs while writing.
    """
    try:
        # Reject missing bounds, inverted ranges and negative starts; a
        # negative index would silently address pages from the end.
        if (
            start_page is None
            or end_page is None
            or start_page < 0
            or start_page > end_page
        ):
            print(f"无效的页面范围: {start_page} 到 {end_page} 文件: {pdf_path}")
            return ""

        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")

        output_doc = PdfWriter()
        for page_num in range(start_page, end_page + 1):
            output_doc.add_page(pdf_document.pages[page_num])
        with open(output_pdf_path, 'wb') as f:
            output_doc.write(f)
        print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
        return output_pdf_path
    except Exception as e:
        # Callers treat "" as "nothing generated", so report and continue.
        print(f"Error in save_extracted_pages for file {pdf_path}: {e}")
        return ""
2024-10-24 16:55:39 +08:00
def find_page_indices(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
    """
    Locate the first and last page of a section in an opened PDF.

    Pages whose index is ``<= begin_page`` are skipped. The start page is the
    first remaining page whose cleaned text matches ``begin_pattern``; the
    end page is the first page after it whose cleaned text matches
    ``end_pattern``.

    Args:
        pdf_document: Opened reader object exposing a ``pages`` sequence.
        begin_pattern: Regex (string or compiled) marking the section start.
        end_pattern: Regex (string or compiled) marking the section end.
        begin_page: Page index up to which pages are ignored.
        common_header: Shared header text passed to ``clean_page_content``.

    Returns:
        A ``(start_page, end_page)`` tuple. ``start_page`` is ``None`` when
        no page matched; ``end_page`` falls back to the last page index when
        no end match was found.
    """
    # Default end: last page of the document, replaced on an end match.
    last_index = len(pdf_document.pages) - 1
    first_index = None

    for index, page in enumerate(pdf_document.pages):
        if index <= begin_page:
            continue
        page_text = clean_page_content(page.extract_text() or "", common_header)

        if first_index is None:
            # Still looking for the start; the end pattern is never tested
            # on the start page itself.
            if re.search(begin_pattern, page_text):
                first_index = index
            continue

        if re.search(end_pattern, page_text):
            last_index = index
            break

    return first_index, last_index
2024-10-24 15:52:35 +08:00
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """
    Extract the pages between a begin and an end pattern into a new PDF.

    When ``output_suffix`` is ``"format"`` and the first search finds no
    start page, the search is repeated once with a looser begin pattern.

    Args:
        pdf_path: Path of the source PDF.
        output_folder: Directory that receives the extracted file.
        begin_pattern: Regex marking the first page of the section.
        begin_page: Page index up to which pages are ignored.
        end_pattern: Regex marking the page that ends the section.
        output_suffix: Suffix used for the output file name and log lines.

    Returns:
        The path of the written file, or an empty string when no start page
        was found or any error occurred.
    """
    try:
        common_header = extract_common_header(pdf_path)
        pdf_document = PdfReader(pdf_path)
        start_page, end_page = find_page_indices(
            pdf_document, begin_pattern, end_pattern, begin_page, common_header
        )

        retried = output_suffix == "format" and start_page is None
        if retried:
            print(f"{output_suffix}: 未找到起始页,尝试二次提取! 文件: {pdf_path}")
            # Second pass: same end pattern, looser begin pattern.
            fallback_begin = re.compile(r'^(?:响应|投标).*?格式.*', re.MULTILINE)
            start_page, end_page = find_page_indices(
                pdf_document, fallback_begin, end_pattern, begin_page, common_header
            )

        if start_page is None:
            if retried:
                print(f"{output_suffix}: 未找到起始页,提取失败! 文件: {pdf_path}")
            else:
                print(f"未找到起始页,提取失败! 文件: {pdf_path}")
            return ""

        if end_page is None:
            end_page = len(pdf_document.pages) - 1
            print(f"{output_suffix}: 未找到结束页,默认截取到文件末尾。 文件: {pdf_path}")

        return save_extracted_pages(
            pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix
        )
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return ""
2024-10-24 16:55:39 +08:00
2024-10-24 15:52:35 +08:00
def process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """
    Extract the requested section from a single file.

    Word-to-PDF conversion is currently disabled in this module, so Word
    documents are skipped with an explicit message instead of being passed
    to the PDF parser.

    Args:
        file_path: Path of the file to process.
        output_folder: Directory that receives the extracted file.
        begin_pattern: Regex marking the first page of the section.
        begin_page: Page index up to which pages are ignored.
        end_pattern: Regex marking the page that ends the section.
        output_suffix: Suffix used for the output file name.

    Returns:
        The path of the generated PDF, or an empty string when nothing was
        generated.
    """
    # pdf_path = convert_to_pdf(file_path)
    if file_path.lower().endswith(('.doc', '.docx')):
        # Without conversion a Word file can only fail inside the PDF reader.
        print(f"Word转PDF未启用,已跳过文件: {file_path}")
        return ""
    pdf_path = file_path
    return extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) or ""
2024-10-24 15:52:35 +08:00
def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """
    Process a single file or every PDF/Word file directly inside a directory.

    The output folder is created when missing. Directory entries are handled
    in sorted name order so the result list is deterministic.

    Args:
        input_path: A file path or a directory path.
        output_folder: Directory that receives the extracted files.
        begin_pattern: Regex marking the first page of the section.
        begin_page: Page index up to which pages are ignored.
        end_pattern: Regex marking the page that ends the section.
        output_suffix: Suffix used for the output file names.

    Returns:
        The paths of the generated files; inputs that produced nothing are
        left out.

    Raises:
        OSError: If the output folder cannot be created.
    """
    # exist_ok avoids the check-then-create race between concurrent callers.
    os.makedirs(output_folder, exist_ok=True)

    if os.path.isdir(input_path):
        candidates = []
        for file_name in sorted(os.listdir(input_path)):
            file_path = os.path.join(input_path, file_name)
            # Skip sub-directories even when their name ends in .pdf/.doc.
            if os.path.isfile(file_path) and is_pdf_or_doc(file_path):
                candidates.append(file_path)
    elif os.path.isfile(input_path) and is_pdf_or_doc(input_path):
        candidates = [input_path]
    else:
        print("提供的路径既不是文件夹也不是PDF/Word文件。")
        candidates = []

    results = (
        process_files(path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
        for path in candidates
    )
    return [result for result in results if result]  # Drop empty strings.
2024-10-24 15:52:35 +08:00
def truncate_pdf_main(input_path, output_folder, selection):
    """
    Entry point: extract a section of the input PDF(s) chosen by ``selection``.

    Only ``selection == 1`` is supported. It searches past page index 10 for
    a chapter/part heading that mentions a response or bid format and stops
    at the next chapter/part heading; output files use the suffix "format".

    Args:
        input_path: A file path or a directory path.
        output_folder: Directory that receives the extracted files.
        selection: Extraction mode; only ``1`` is valid.

    Returns:
        The list of generated file paths, or an empty list for an invalid
        selection or when an exception is raised.
    """
    try:
        if selection != 1:
            print("无效的选择:请选择1")
            return []

        # selection 1: bid document format section
        local_output_suffix = "format"
        begin_page = 10
        begin_pattern = re.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*',
            re.MULTILINE,
        )
        end_pattern = re.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
            re.MULTILINE,
        )
        # Hand over to the generic file/directory processor.
        return process_input(
            input_path,
            output_folder,
            begin_pattern,
            begin_page,
            end_pattern,
            local_output_suffix,
        )
    except Exception as e:
        print(f"Error in truncate_pdf_main: {e}")
        return []
2024-10-24 15:52:35 +08:00
if __name__ == "__main__":
    # Manual run against a local sample file; paths are machine-specific.
    selection = 1  # 1 - bid document format section
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\包头市公安支队机动车查验监管系统招标文201907.pdf"

    # Run the extraction and report what was written.
    generated_files = truncate_pdf_main(input_path, output_folder, selection)
    print("生成的文件:", generated_files)