zbparse/flask_app/general/insert_del_pagemark.py

103 lines
3.9 KiB
Python
Raw Permalink Normal View History

2024-12-17 14:47:19 +08:00
import os
2025-01-03 10:45:32 +08:00
from docx.opc.exceptions import PackageNotFoundError
2024-12-17 14:47:19 +08:00
from io import BytesIO
from PyPDF2 import PdfReader, PdfWriter
from docx import Document
from reportlab.pdfgen import canvas
from reportlab.lib.units import cm
2024-12-24 17:32:00 +08:00
# Copying the content of input_pdf_path into invalid_added.pdf is the step that may raise an error; drawing the new page almost never fails.
2024-12-17 14:47:19 +08:00
def insert_mark(input_pdf_path):
    """Interleave marker pages into a PDF and save it as ``invalid_added.pdf``.

    After each page of the input, a generated page of the same size is added
    that carries the text ``[$$index_mark_N$$]``, where N is the 1-based
    number of the page it follows. The result is written to the directory
    of the input file.

    Args:
        input_pdf_path: Path of the PDF to read.

    Returns:
        The path of the written ``invalid_added.pdf``. If any exception is
        raised along the way, the error is printed and ``input_pdf_path`` is
        returned instead.
    """
    try:
        # The output lives next to the input under a fixed file name.
        target_path = os.path.join(os.path.dirname(input_pdf_path), "invalid_added.pdf")

        with open(input_pdf_path, 'rb') as src:
            reader = PdfReader(src)
            writer = PdfWriter()

            for mark_no, original_page in enumerate(reader.pages, start=1):
                writer.add_page(original_page)

                # The marker page copies the dimensions of the page it follows.
                width = float(original_page.mediabox.width)
                height = float(original_page.mediabox.height)

                # Render the marker page into an in-memory PDF with reportlab.
                buffer = BytesIO()
                sheet = canvas.Canvas(buffer, pagesize=(width, height))
                sheet.setFont("Helvetica", 12)
                # 2.3 cm from the left edge, 0.5 cm below the top edge.
                sheet.drawString(2.3 * cm, height - 0.5 * cm, f"[$$index_mark_{mark_no}$$]")
                sheet.save()

                # Read the rendered page back and append it to the output.
                buffer.seek(0)
                writer.add_page(PdfReader(buffer).pages[0])

            # Written while the input file is still open.
            with open(target_path, 'wb') as sink:
                writer.write(sink)

        print("invalid_file added successfully!")
        return target_path
    except Exception as e:
        print(f"发生错误: {e}")
        return input_pdf_path
2024-12-17 14:47:19 +08:00
def delete_mark(docx_path):
    """Strip ``[$$index_mark_N$$]`` marker paragraphs from a .docx file.

    Every paragraph whose text contains ``[$$index_mark_`` is removed. After
    a marker has been seen, the next paragraph whose XML contains
    ``w:sectPr`` (the blank section break) is removed as well. The modified
    document is saved as ``invalid_del.docx`` in the directory of
    ``docx_path``.

    Args:
        docx_path: Path of the .docx file to clean.

    Returns:
        The path of the saved ``invalid_del.docx``, or ``""`` when the
        document cannot be opened (``KeyError`` or ``PackageNotFoundError``).
    """
    try:
        docx = Document(docx_path)
    except KeyError as e:
        print(f"Error opening document: {e}")
        return ""
    except PackageNotFoundError as e:
        print(f"Invalid package: {e}")
        return ""

    find_flag = False
    for para in docx.paragraphs:
        element = para._element
        # Marker paragraph: [$$index_mark_X$$]
        if "[$$index_mark_" in para.text:
            element.getparent().remove(element)
            find_flag = True
        # Blank section break that follows a marker.
        if find_flag and "w:sectPr" in element.xml:
            parent = element.getparent()
            # parent is None when this is the marker paragraph that was just
            # detached above and it carried the section properties itself;
            # removing it a second time would raise AttributeError.
            if parent is not None:
                parent.remove(element)
            find_flag = False

    # Save the cleaned document next to the original.
    dir_path = os.path.dirname(docx_path)
    new_file_path = os.path.join(dir_path, 'invalid_del.docx')
    docx.save(new_file_path)
    return new_file_path
2024-12-17 14:47:19 +08:00
if __name__ == '__main__':
    # Manual smoke test: strip the index marks from a previously converted document.
    # To exercise insert_mark instead, pass it the path of a source PDF:
    #   output = insert_mark(r'<path to a .pdf file>')
    sample_docx = r'D:\flask_project\flask_app\static\output\output1\90b073b5-b8fb-4b11-9962-10de7c3f3854\invalid_added.docx'
    cleaned_path = delete_mark(sample_docx)
    # delete_mark returns "" when the document could not be opened.
    print(cleaned_path if cleaned_path else "No")