2024-10-24 15:52:35 +08:00
|
|
|
|
from PyPDF2 import PdfReader, PdfWriter
|
|
|
|
|
import re # 导入正则表达式库
|
|
|
|
|
import os # 用于文件和文件夹操作
|
|
|
|
|
|
|
|
|
|
def clean_page_content(text, common_header):
    """Strip the shared header and page-number artefacts from one page of text.

    Args:
        text: Raw text of a single page.
        common_header: Newline-separated header lines shared by the pages
            (may be empty or None, in which case no header is removed).

    Returns:
        The page text with the header lines and page-number patterns removed.
    """
    if common_header:
        stripped_headers = (raw.strip() for raw in common_header.split('\n'))
        for header in stripped_headers:
            if not header:
                continue
            # The pattern is anchored at the very start of the text (no
            # MULTILINE), so each header line is only removed while it is the
            # leading content; at most one occurrence is dropped.
            text = re.sub(r'^' + re.escape(header) + r'\n?', '', text, count=1)

    # Page-number formats, applied in this order:
    #   1. a leading number, only when followed by a non-digit character
    #   2. a trailing number preceded by whitespace
    #   3. "/129"-style numbers (every occurrence in the text)
    #   4. "—2—" or "-2-" style numbers (every occurrence in the text)
    page_number_patterns = (
        r'^\s*\d+\s*(?=\D)',
        r'\s+\d+\s*$',
        r'\s*\/\s*\d+\s*',
        r'\s*[—-]\s*\d+\s*[—-]\s*',
    )
    for pattern in page_number_patterns:
        text = re.sub(pattern, '', text)

    return text
|
2024-10-24 20:51:48 +08:00
|
|
|
|
|
|
|
|
|
|
2024-10-24 15:52:35 +08:00
|
|
|
|
def extract_common_header(pdf_path):
    """Derive the header text shared by a few sample pages of a PDF.

    For a two-page document both pages are sampled; otherwise up to three
    pages starting just before the middle of the document are sampled. Only
    the first three lines of each sampled page are compared.

    Args:
        pdf_path: Path of the PDF file to open with ``PdfReader``.

    Returns:
        The common header as newline-joined lines, or "" when fewer than two
        sampled pages yielded any text.
    """
    reader = PdfReader(pdf_path)
    page_count = len(reader.pages)

    # Decide how many pages to sample and where the sample starts.
    if page_count == 2:
        sample_size, first_index = 2, 0
    else:
        sample_size, first_index = 3, max(0, (page_count // 2) - 1)
    stop_index = min(first_index + sample_size, page_count)

    sampled_tops = []
    for index in range(first_index, stop_index):
        page_text = reader.pages[index].extract_text() or ""
        if not page_text:
            continue
        # Keep only the first three lines of the page.
        sampled_tops.append(page_text.strip().split('\n')[:3])

    if len(sampled_tops) < 2:
        return ""  # Not enough pages to compare.

    # Line by line, keep the words of the first sampled page (in their
    # original order) that also occur as whole words on every other page.
    shared_lines = []
    for reference, *others in zip(*sampled_tops):
        other_tokens = [line.split() for line in others]
        kept_words = [
            word for word in reference.split()
            if all(word in tokens for tokens in other_tokens)
        ]
        if kept_words:
            shared_lines.append(' '.join(kept_words))

    return '\n'.join(shared_lines)
|
|
|
|
|
|
2024-10-24 16:55:39 +08:00
|
|
|
|
|
2024-10-24 15:52:35 +08:00
|
|
|
|
def is_pdf_or_doc(filename):
    """Return True when *filename* ends with .pdf, .doc or .docx (case-insensitive)."""
    lowered = filename.lower()
    return any(lowered.endswith(extension) for extension in ('.pdf', '.doc', '.docx'))
|
|
|
|
|
|
2024-10-24 16:55:39 +08:00
|
|
|
|
|
|
|
|
|
# def convert_to_pdf(file_path):
|
|
|
|
|
# """
|
|
|
|
|
# 如果是Word文档,则转换为PDF。
|
|
|
|
|
# """
|
|
|
|
|
# if file_path.lower().endswith(('.doc', '.docx')):
|
|
|
|
|
# return docx2pdf(file_path)
|
|
|
|
|
# return file_path
|
|
|
|
|
|
2024-10-24 15:52:35 +08:00
|
|
|
|
|
|
|
|
|
def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):
    """Write pages ``start_page..end_page`` (inclusive) to ``bid_format.pdf``.

    The output file is always named "bid_format.pdf" inside *output_folder*.

    Args:
        pdf_document: Reader object exposing a ``pages`` sequence.
        start_page: Zero-based index of the first page to copy, or None.
        end_page: Zero-based index of the last page to copy, or None.
        pdf_path: Source file path; used only in log messages.
        output_folder: Folder in which the output file is written.
        output_suffix: Label used only in the success log message.

    Returns:
        The path of the written PDF, or "" when the range is missing or
        invalid, or when any exception occurs while writing.
    """
    try:
        target_path = os.path.join(output_folder, "bid_format.pdf")

        # Guard: both bounds must be known.
        if any(bound is None for bound in (start_page, end_page)):
            print(f"起始页或结束页为 None: start_page={start_page}, end_page={end_page} 文件: {pdf_path}")
            return ""

        # Guard: the range must lie inside the document and be ordered.
        range_is_valid = (
            start_page >= 0
            and end_page < len(pdf_document.pages)
            and start_page <= end_page
        )
        if not range_is_valid:
            print(f"无效的页面范围: {start_page} 到 {end_page} 文件: {pdf_path}")
            return ""

        writer = PdfWriter()
        for index in range(start_page, end_page + 1):
            writer.add_page(pdf_document.pages[index])

        with open(target_path, 'wb') as handle:
            writer.write(handle)

        print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {target_path}")
        return target_path
    except Exception as e:
        print(f"Error in save_extracted_pages for file {pdf_path}: {e}")
        return ""  # Signal failure with an empty string.
|
|
|
|
|
|
2024-10-24 16:55:39 +08:00
|
|
|
|
|
2024-10-25 10:18:38 +08:00
|
|
|
|
def extract_pages_generic(pdf_document, begin_pattern, begin_page, common_header, total_pages):
    """Find the start page by scanning backwards; the end page is the last page.

    Pages are examined from ``total_pages`` down to ``begin_page + 1``. The
    first page (from the back) whose cleaned text matches *begin_pattern* is
    the start page.

    Args:
        pdf_document: Reader object exposing a ``pages`` sequence.
        begin_pattern: Regex (string or compiled) identifying the start page.
        begin_page: Lower bound; only page indices strictly greater than this
            value can be chosen as the start page.
        common_header: Shared page header, removed before matching.
        total_pages: Zero-based index of the last page (callers in this file
            pass ``len(pages) - 1``).

    Returns:
        ``(start_page, end_page)`` where ``start_page`` is the matching page
        index or None when nothing matched, and ``end_page`` is
        ``total_pages``.
    """
    start_page = None
    end_page = total_pages  # The end page is always the end of the file.

    # A page can only qualify when its index is > begin_page, so the scan
    # stops there instead of extracting text from pages that would be
    # rejected anyway. The bound is clamped to -1 so a negative begin_page
    # never lets the range run into negative (wrap-around) indices.
    scan_stop = max(begin_page, -1)

    for i in range(total_pages, scan_stop, -1):
        text = pdf_document.pages[i].extract_text() or ""
        cleaned_text = clean_page_content(text, common_header)
        # print(cleaned_text)  # Uncomment for debugging.

        if re.search(begin_pattern, cleaned_text):
            start_page = i
            print(f"找到起始页: 页码 {i + 1}")  # Log the 1-based page number.
            break  # Stop at the first match from the back.

    return start_page, end_page
|
2024-10-24 16:55:39 +08:00
|
|
|
|
|
2024-10-25 10:18:38 +08:00
|
|
|
|
|
2024-10-24 20:51:48 +08:00
|
|
|
|
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page=10):
    """Fallback extraction: locate the start page with looser patterns.

    The begin patterns are tried in order; for each one the pages are scanned
    from the last page backwards. Pages whose cleaned text matches the
    exclusion pattern are skipped. The extracted range always runs to the end
    of the file.

    Args:
        pdf_path: Path of the PDF file.
        output_folder: Folder that receives the extracted pages.
        output_suffix: Label used in log messages.
        common_header: Shared page header, removed before matching.
        begin_page: Lower bound; only page indices strictly greater than this
            value can be chosen as the start page. Defaults to 10, the value
            that was previously hard-coded.

    Returns:
        The result of ``save_extracted_pages`` on success, or "" when no
        start page is found or an exception occurs.
    """
    try:
        pdf_document = PdfReader(pdf_path)
        total_pages = len(pdf_document.pages) - 1  # Index of the last page.
        end_page = total_pages  # The end page is always the end of the file.

        # Start patterns, in priority order.
        begin_patterns = [
            re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*', re.MULTILINE),
            re.compile(r"\s*(投标文件格式|响应文件格式|响应性文件格式)\s*")
        ]
        # Pages matching this pattern are never accepted as the start page.
        exclusion_pattern = re.compile(r'文件的构成|文件的组成|目录|评标办法|技术标准|其他材料|合同')

        # Text extraction is the expensive step; cache the cleaned text so a
        # page is extracted at most once even when several patterns are tried.
        cleaned_cache = {}

        def cleaned_text_of(index):
            if index not in cleaned_cache:
                text = pdf_document.pages[index].extract_text() or ""
                cleaned_cache[index] = clean_page_content(text, common_header)
            return cleaned_cache[index]

        # Only indices > begin_page can qualify, so the backward scan stops
        # there; clamping to -1 keeps a negative bound from producing
        # negative (wrap-around) page indices.
        candidate_indices = range(total_pages, max(begin_page, -1), -1)

        start_page = None
        for begin_pattern in begin_patterns:
            for i in candidate_indices:
                cleaned_text = cleaned_text_of(i)

                # Skip pages that match the exclusion pattern.
                if exclusion_pattern.search(cleaned_text):
                    continue

                if begin_pattern.search(cleaned_text):
                    start_page = i
                    print(f"二次提取找到起始页: 页码 {i + 1}")  # Log the 1-based page number.
                    break

            if start_page is not None:
                break  # An earlier pattern already found the start page.

        if start_page is None:
            print(f"{output_suffix}: 未找到起始页,二次提取失败! 文件: {pdf_path}")
            return ""

        return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)

    except Exception as e:
        print(f"Error processing {pdf_path} in extract_pages_twice: {e}")
        return ""
|
2024-10-25 10:18:38 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, output_suffix):
    """Extract pages from the matched start page to the end of the PDF.

    Args:
        pdf_path: Path of the PDF file.
        output_folder: Folder that receives the extracted pages.
        begin_pattern: Regex identifying the start page.
        begin_page: Lower bound for the start page index.
        output_suffix: Label of the extraction; when it is "format" and no
            start page is found, a second extraction pass is attempted.

    Returns:
        The result of ``save_extracted_pages`` or ``extract_pages_twice``,
        or "" when an exception occurs.
    """
    try:
        common_header = extract_common_header(pdf_path)
        reader = PdfReader(pdf_path)
        last_index = len(reader.pages) - 1  # Index of the last page.

        # Locate the start page; the end page is always the end of the file.
        start_page, end_page = extract_pages_generic(
            reader, begin_pattern, begin_page, common_header, last_index
        )

        # Only the "format" extraction falls back to the second pass.
        needs_second_pass = output_suffix == "format" and start_page is None
        if needs_second_pass:
            print(f"{output_suffix}: 未找到起始页,提取失败! 文件: {pdf_path} 尝试二次提取!")
            return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header)

        # Every other case saves the range as found; a missing start page is
        # reported by save_extracted_pages itself.
        return save_extracted_pages(reader, start_page, end_page, pdf_path, output_folder, output_suffix)

    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return ""
|
|
|
|
|
|
2024-10-24 16:55:39 +08:00
|
|
|
|
|
2024-10-25 10:18:38 +08:00
|
|
|
|
def process_files(file_path, output_folder, begin_pattern, begin_page, output_suffix):
    """Process a single file by extracting its pages.

    The Word-to-PDF conversion step is currently disabled, so *file_path* is
    passed to ``extract_pages`` unchanged.

    Returns:
        The value returned by ``extract_pages``, or "" when that value is
        falsy.
    """
    # pdf_path = convert_to_pdf(file_path)
    extracted = extract_pages(file_path, output_folder, begin_pattern, begin_page, output_suffix)
    if extracted:
        return extracted
    return ""
|
2024-10-24 16:55:39 +08:00
|
|
|
|
|
2024-10-24 15:52:35 +08:00
|
|
|
|
|
2024-10-25 10:18:38 +08:00
|
|
|
|
def process_input(input_path, output_folder, begin_pattern, begin_page,output_suffix):
    """Process an input path that is either a single file or a directory.

    The output folder is created when it does not exist. For a directory,
    every entry whose name passes ``is_pdf_or_doc`` is processed; for a
    single file, it is processed when it passes the same check.

    Returns:
        A list with one entry per processed file (entries may be ""); an
        empty list when nothing was processed.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Work out which paths to process.
    if os.path.isdir(input_path):
        entries = [os.path.join(input_path, name) for name in os.listdir(input_path)]
        candidates = [entry for entry in entries if is_pdf_or_doc(entry)]
    elif os.path.isfile(input_path) and is_pdf_or_doc(input_path):
        candidates = [input_path]
    else:
        print("提供的路径既不是文件夹也不是PDF文件。")
        candidates = []

    generated_files = []
    for candidate in candidates:
        outcome = process_files(candidate, output_folder, begin_pattern, begin_page, output_suffix)
        if isinstance(outcome, tuple):
            # Keep a slot for every element; falsy ones become "".
            generated_files.extend(item if item else "" for item in outcome)
        else:
            # Appended as-is; may be an empty string.
            generated_files.append(outcome)

    return generated_files
|
2024-10-24 15:52:35 +08:00
|
|
|
|
|
2024-10-24 20:51:48 +08:00
|
|
|
|
|
|
|
|
|
def truncate_pdf_main(input_path, output_folder, selection,begin_page=10):
    """Entry point: truncate the PDF(s) according to *selection*.

    Args:
        input_path: A file or directory to process.
        output_folder: Folder that receives the extracted pages.
        selection: Extraction mode; only 1 (bid document format) is supported.
        begin_page: Lower bound for the start page index.

    Returns:
        The list produced by ``process_input`` (or "" when it is empty),
        None for an unsupported selection, or "" when an exception occurs.
    """
    try:
        if selection != 1:
            print("无效的选择:请选择1")
            return None

        # Selection 1: bid document format. The pattern is compiled without
        # re.MULTILINE, so "^" anchors at the start of the page text only.
        begin_pattern = re.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*'
        )
        # end_pattern = re.compile(
        #     r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE
        # )
        local_output_suffix = "format"

        generated = process_input(input_path, output_folder, begin_pattern, begin_page, local_output_suffix)
        return generated or ""
    except Exception as e:
        print(f"Error in truncate_pdf_main: {e}")
        return ""  # Signal failure with an empty string.
|
2024-10-24 16:55:39 +08:00
|
|
|
|
|
2024-10-24 15:52:35 +08:00
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Sample run with hard-coded local paths.
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
    input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest2.pdf"
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
    selection = 1  # 1 - bid document format

    # Run the truncation and report what was produced.
    generated_files = truncate_pdf_main(
        input_path=input_path,
        output_folder=output_folder,
        selection=selection,
    )
    print("生成的文件:", generated_files)
|