zbparse/flask_app/货物标/货物标截取pdf.py
2024-08-29 16:37:09 +08:00

106 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from PyPDF2 import PdfReader, PdfWriter
import re # 导入正则表达式库
import os # 用于文件和文件夹操作
def clean_page_numbers(text):
    """Strip page-number artifacts from the text extracted from one PDF page.

    Three substitutions are applied, in this order:

    1. a number at the very start of the text, provided a non-digit
       character follows it;
    2. a whitespace-preceded number at the very end of the text;
    3. every "/129"-style fragment (a slash followed by digits), together
       with the whitespace around it, anywhere in the text.

    Returns the text with those fragments removed.
    """
    page_number_patterns = (
        r'^\s*\d+\s*(?=\D)',  # leading page number, only when a non-digit follows
        r'\s+\d+\s*$',        # trailing page number
        r'\s*\/\s*\d+\s*',    # "/129"-style page marker
    )
    for pattern in page_number_patterns:
        text = re.sub(pattern, '', text)
    return text
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """Copy the page range delimited by two regex matches into a new PDF.

    Args:
        pdf_path: Path of the PDF to read.
        output_folder: Folder the extracted PDF is written to. It is not
            created here.
        begin_pattern: Regex (string or compiled) marking the first page.
        begin_page: 0-based page index; only pages with a strictly greater
            index may be chosen as the start page.
        end_pattern: Regex (string or compiled) marking the last page.
        output_suffix: Appended to the source file's base name to form the
            output name "<base>_<suffix>.pdf".

    Returns:
        The path of the written PDF, or None when no start page or no end
        page was found.
    """
    pdf_document = PdfReader(pdf_path)
    start_page = None
    end_page = None

    # Scan every page for the start and end markers.
    for i, page in enumerate(pdf_document.pages):
        text = page.extract_text()
        if not text:
            continue
        cleaned_text = clean_page_numbers(text)
        # NOTE(review): every later match overwrites start_page until the end
        # marker is found, so the LAST begin match before the end page wins.
        # Confirm that this (rather than the first match) is intended.
        if i > begin_page and re.search(begin_pattern, cleaned_text):
            start_page = i
        # The end marker must sit at least two pages after the start page.
        if start_page is not None and re.search(end_pattern, cleaned_text) and i > (start_page + 1):
            end_page = i
            break

    # Both markers are required.
    if start_page is None or end_page is None:
        print(f"未找到起始或结束页在文件 {pdf_path} 中!")
        return None

    # Output name: "<source base name>_<suffix>.pdf" inside output_folder.
    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")

    # Copy pages start_page..end_page, both inclusive.
    output_doc = PdfWriter()
    for page_num in range(start_page, end_page + 1):
        output_doc.add_page(pdf_document.pages[page_num])

    with open(output_pdf_path, 'wb') as f:
        output_doc.write(f)

    # The original message ran the two page indices and the path together
    # with no separators; they are now delimited so the log is readable.
    print(f"已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
    return output_pdf_path
def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """Run extract_pages on a single PDF or on every PDF in a folder.

    Args:
        input_path: A folder containing ".pdf" files, or the path of one
            ".pdf" file.
        output_folder: Destination folder; created if it does not exist.
        begin_pattern, begin_page, end_pattern, output_suffix: Passed through
            to extract_pages unchanged.

    Returns:
        A list with the paths of the PDFs that were written. The list is
        empty when nothing was extracted or when input_path is neither a
        folder nor a ".pdf" file.
    """
    # Make sure the output folder exists.
    os.makedirs(output_folder, exist_ok=True)

    if os.path.isdir(input_path):
        generated_files = []
        # Process every ".pdf" entry in the folder (non-recursive).
        for file in os.listdir(input_path):
            if not file.endswith(".pdf"):
                continue
            pdf_path = os.path.join(input_path, file)
            output_pdf_path = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
            if output_pdf_path and os.path.isfile(output_pdf_path):
                generated_files.append(output_pdf_path)
        return generated_files

    if os.path.isfile(input_path) and input_path.endswith(".pdf"):
        # Single PDF: still return a list so both branches have the same shape.
        output_pdf_path = extract_pages(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
        if output_pdf_path and os.path.isfile(output_pdf_path):
            return [output_pdf_path]
        # Extraction failed. Return an empty list rather than falling through
        # to an implicit None, which callers that extend() the result cannot
        # handle.
        return []

    print("提供的路径既不是文件夹也不是PDF文件。")
    return []
def truncate_pdf_main(input_path, output_folder, selection):
    """Extract the section identified by ``selection`` from the input PDF(s).

    Only ``selection == 1`` is configured. Any other value prints a message
    and returns None.

    Args:
        input_path: A PDF file or a folder of PDFs, handed to process_input.
        output_folder: Folder the extracted PDFs are written to.
        selection: Numeric code of the section to extract.

    Returns:
        Whatever process_input returns for selection 1, otherwise None.
    """
    if selection != 1:
        print("无效的选择")
        return None

    # Selection 1. The original comment labels this configuration
    # "投标人须知前附表" and the suffix matches that label, but the start
    # pattern looks for a chapter heading containing 项目/服务/商务 followed
    # by 要求, and the end pattern for a chapter heading titled
    # 资格审查/评标方法/评审办法.
    start_regex = re.compile(r'第[一二三四五六七八九十百千]+章.*?(?:项目|服务|商务).*?要求')
    stop_regex = re.compile(r'第[一二三四五六七八九十百千]+章\s*(资格审查|评标方法|评审办法)')

    return process_input(
        input_path,
        output_folder,
        begin_pattern=start_regex,
        begin_page=5,  # only pages with index > 5 can be the start page
        end_pattern=stop_regex,
        output_suffix="tobidders_notice_table",
    )
def truncate_pdf_multiple(input_path, output_folder):
    """Run every configured selection and collect the generated PDF paths.

    Args:
        input_path: A PDF file or a folder of PDFs.
        output_folder: Folder the extracted PDFs are written to.

    Returns:
        A flat list with the paths produced by all selections.
    """
    truncate_files = []
    # range(1, 2) covers only selection 1, the single configured selection.
    for selection in range(1, 2):
        files = truncate_pdf_main(input_path, output_folder, selection)
        # truncate_pdf_main can return None (invalid selection, or nothing
        # extracted); extending with None would raise TypeError, so skip it.
        if files:
            truncate_files.extend(files)
    return truncate_files
if __name__ == "__main__":
    # Sample run with hard-coded local paths; adjust them before running.
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1"
    input_path = "C:\\Users\\Administrator\\WPSDrive\\292826853\\WPS云盘\\应用\\输出为PDF\\公安局音视频监控系统设备采购项目(定稿_20240829133603.pdf"

    # Selection codes listed by the author: 1 - instructions-to-bidders data
    # sheet, 2 - bid evaluation method, 3 - instructions-to-bidders main text,
    # 4 - tender notice up to the contract terms.
    selection = 1

    # Alternative entry point: truncate_pdf_multiple(input_path, output_folder)
    generated_files = truncate_pdf_main(input_path, output_folder, selection)
    # To inspect the result: print("生成的文件:", generated_files)