zbparse/flask_app/testdir/截取文件格式.py
2024-10-24 16:55:39 +08:00

220 lines
8.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from PyPDF2 import PdfReader, PdfWriter
import re # 导入正则表达式库
import os # 用于文件和文件夹操作
# from flask_app.general.format_change import docx2pdf # 确保此模块可用
def clean_page_content(text, common_header):
    """
    Strip the shared page header and page-number artefacts from page text.

    Args:
        text: Raw text extracted from a single PDF page.
        common_header: Newline-separated header lines shared by the pages;
            an empty value skips header removal.

    Returns:
        The text with matching header lines and page-number-like tokens removed.
    """
    if common_header:
        stripped_lines = (raw.strip() for raw in common_header.split('\n'))
        for header in filter(None, stripped_lines):
            # A header line is dropped only when the text begins with it,
            # together with the newline directly following it (if present).
            if text.startswith(header):
                text = text[len(header):]
                if text.startswith('\n'):
                    text = text[1:]

    # One combined pattern: a line-leading number followed by a non-digit,
    # a line-trailing number, "/N" fragments, and dash-wrapped numbers.
    page_number_pattern = re.compile(
        r'(^\s*\d+\s*(?=\D))|(\s+\d+\s*$)|(\s*/\s*\d+\s*)|(\s*[—-]\s*\d+\s*[—-]\s*)',
        re.MULTILINE,
    )
    return page_number_pattern.sub('', text)
def extract_common_header(pdf_path):
    """
    Extract the header lines shared by a sample of pages in a PDF.

    For a two-page document both pages are sampled; otherwise up to three
    consecutive pages starting just before the middle of the document are
    sampled. Only the first three lines of each sampled page are considered.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        The lines common to every sampled page, joined with newlines in the
        order they appear on the first sampled page, or "" when fewer than
        two sampled pages yield text.
    """
    pdf_document = PdfReader(pdf_path)
    total_pages = len(pdf_document.pages)

    # Decide how many pages to sample and where to start.
    if total_pages == 2:
        pages_to_read, start_page = 2, 0
    else:
        pages_to_read = 3
        start_page = max(0, (total_pages // 2) - 1)
    stop_page = min(start_page + pages_to_read, total_pages)

    per_page_lines = []
    for i in range(start_page, stop_page):
        text = pdf_document.pages[i].extract_text() or ""
        if not text:
            continue
        # Only the first three lines of each page are header candidates.
        first_lines = text.strip().split('\n')[:3]
        per_page_lines.append([line.strip() for line in first_lines if line.strip()])

    if len(per_page_lines) < 2:
        return ""  # Not enough pages to compare.

    # Keep the first page's line order: a plain set intersection has no
    # defined order, and the header is later stripped line by line from the
    # start of each page, so the order of the lines matters.
    reference, *others = per_page_lines
    other_sets = [set(lines) for lines in others]
    common_headers = []
    for line in reference:
        if line not in common_headers and all(line in seen for seen in other_sets):
            common_headers.append(line)
    return '\n'.join(common_headers)
def is_pdf_or_doc(filename):
    """
    Tell whether a file name has a PDF or Word extension.

    The check is case-insensitive and accepts ".pdf", ".doc" and ".docx".
    """
    lowered = filename.lower()
    return any(lowered.endswith(suffix) for suffix in ('.pdf', '.doc', '.docx'))
# def convert_to_pdf(file_path):
# """
# 如果是Word文档则转换为PDF。
# """
# if file_path.lower().endswith(('.doc', '.docx')):
# return docx2pdf(file_path)
# return file_path
def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):
    """
    Write an inclusive page range of a PDF to a new file.

    The output is named "<source base name>_<output_suffix>.pdf" and placed
    in ``output_folder``.

    Args:
        pdf_document: Opened PDF whose ``pages`` can be indexed.
        start_page: Zero-based index of the first page to copy.
        end_page: Zero-based index of the last page to copy (inclusive).
        pdf_path: Path of the source PDF; used for the output name and logs.
        output_folder: Directory that receives the new file.
        output_suffix: Suffix appended to the base file name.

    Returns:
        The path of the written PDF, or "" when the range is invalid or any
        error occurs while writing.
    """
    try:
        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
        if start_page is None or end_page is None or start_page > end_page:
            # Separate the two numbers so the logged range is unambiguous.
            print(f"无效的页面范围: {start_page} 到 {end_page} 文件: {pdf_path}")
            return ""
        output_doc = PdfWriter()
        for page_num in range(start_page, end_page + 1):
            output_doc.add_page(pdf_document.pages[page_num])
        with open(output_pdf_path, 'wb') as f:
            output_doc.write(f)
        print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
        return output_pdf_path
    except Exception as e:
        # Best-effort: report the failure and signal it with an empty path.
        print(f"Error in save_extracted_pages for file {pdf_path}: {e}")
        return ""
def find_page_indices(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
    """
    Locate the first and last page of a section by pattern matching.

    Pages with an index at or below ``begin_page`` are skipped. The first
    remaining page whose cleaned text matches ``begin_pattern`` is the start;
    the first later page matching ``end_pattern`` is the end.

    Returns:
        A ``(start_page, end_page)`` tuple. ``start_page`` is None when no
        page matches ``begin_pattern``; ``end_page`` falls back to the index
        of the last page when no end match is found.
    """
    last_index = len(pdf_document.pages) - 1
    first_hit = None
    for index, page in enumerate(pdf_document.pages):
        if index <= begin_page:
            continue
        body = clean_page_content(page.extract_text() or "", common_header)
        if first_hit is None:
            # Still looking for the start; the page that opens the section
            # is never tested against the end pattern.
            if re.search(begin_pattern, body):
                first_hit = index
            continue
        if re.search(end_pattern, body):
            return first_hit, index
    return first_hit, last_index
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """
    Extract the section delimited by begin/end patterns into a new PDF.

    When ``output_suffix`` is "format" and the first search finds no start
    page, a second search is made with a looser begin pattern.

    Returns:
        The path of the saved PDF, or "" when no start page is found or an
        error occurs.
    """
    try:
        header = extract_common_header(pdf_path)
        reader = PdfReader(pdf_path)
        first, last = find_page_indices(reader, begin_pattern, end_pattern, begin_page, header)

        if first is None:
            if output_suffix != "format":
                print(f"未找到起始页,提取失败! 文件: {pdf_path}")
                return ""
            print(f"{output_suffix}: 未找到起始页,尝试二次提取! 文件: {pdf_path}")
            # Second attempt: accept a heading without the chapter prefix.
            fallback_begin = re.compile(r'^(?:响应|投标).*?格式.*', re.MULTILINE)
            first, last = find_page_indices(reader, fallback_begin, end_pattern, begin_page, header)
            if first is None:
                print(f"{output_suffix}: 未找到起始页,提取失败! 文件: {pdf_path}")
                return ""

        if last is None:
            # No end page reported: cut through to the end of the document.
            last = len(reader.pages) - 1
            print(f"{output_suffix}: 未找到结束页,默认截取到文件末尾。 文件: {pdf_path}")

        return save_extracted_pages(reader, first, last, pdf_path, output_folder, output_suffix)
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return ""
def process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """
    Run page extraction for a single file.

    The file is handed to ``extract_pages`` as-is; the Word-to-PDF conversion
    step is currently disabled.

    Returns:
        The path produced by ``extract_pages``, or "" when it yields nothing.
    """
    extracted = extract_pages(
        file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix
    )
    return extracted if extracted else ""
def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """
    Process a single file or every PDF/Word file directly inside a directory.

    The output folder is created when it does not exist yet.

    Returns:
        The list of non-empty output paths produced for the processed files.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    if os.path.isdir(input_path):
        entries = (os.path.join(input_path, name) for name in os.listdir(input_path))
        candidates = [entry for entry in entries if is_pdf_or_doc(entry)]
    elif os.path.isfile(input_path) and is_pdf_or_doc(input_path):
        candidates = [input_path]
    else:
        print("提供的路径既不是文件夹也不是PDF/Word文件。")
        candidates = []

    outputs = [
        process_files(path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
        for path in candidates
    ]
    # Failed extractions come back as empty strings; drop them.
    return [produced for produced in outputs if produced]
def truncate_pdf_main(input_path, output_folder, selection):
    """
    Entry point: cut the section chosen by ``selection`` out of the input.

    Only ``selection == 1`` (the bid/response document format chapter) is
    supported; any other value prints a message and yields an empty list.

    Returns:
        The list of generated file paths, or [] on invalid selection or error.
    """
    try:
        if selection != 1:
            print("无效的选择:请选择1")
            return []

        # Selection 1: start at the chapter heading about the bid/response
        # format, skipping pages with index <= 10, and stop at the next
        # chapter heading.
        start_after_page = 10
        chapter_begin = re.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*',
            re.MULTILINE
        )
        chapter_end = re.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
            re.MULTILINE
        )
        suffix = "format"
        return process_input(input_path, output_folder, chapter_begin, start_after_page, chapter_end, suffix)
    except Exception as e:
        print(f"Error in truncate_pdf_main: {e}")
        return []
if __name__ == "__main__":
    # Hard-coded local sample: the input PDF and the folder for the output.
    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\包头市公安支队机动车查验监管系统招标文201907.pdf"
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
    selection = 1  # 1 = bid document format section

    # Run the extraction and report what was written.
    generated_files = truncate_pdf_main(
        input_path=input_path,
        output_folder=output_folder,
        selection=selection,
    )
    print("生成的文件:", generated_files)