zbparse/flask_app/general/insert_del_pagemark.py

114 lines
4.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
from docx.opc.exceptions import PackageNotFoundError
from io import BytesIO
from PyPDF2 import PdfReader, PdfWriter
from docx import Document
from reportlab.pdfgen import canvas
from reportlab.lib.units import cm
import copy
# Copying the contents of input_pdf_path into invalid_added.pdf is the step that may raise; drawing the new marker page almost never fails.
def insert_mark(input_pdf_path):
    """Insert a text marker page after every page of a PDF.

    Each page of ``input_pdf_path`` is copied to the output, followed by a
    new page of the same size that carries the text
    ``[$$index_mark_<n>$$]`` (``n`` is the 1-based number of the page it
    follows), drawn in 12 pt Helvetica 2.3 cm from the left edge and 0.5 cm
    below the top edge. The result is saved as ``invalid_added.pdf`` in the
    directory of the input file.

    The output is first serialized in memory and only then written to disk,
    after the input file has been closed. This way a failure during
    serialization never leaves a truncated ``invalid_added.pdf`` behind, and
    an input that is itself named ``invalid_added.pdf`` is not truncated
    while it is still being read.

    Args:
        input_pdf_path: Path of the PDF to read.

    Returns:
        The path of the generated ``invalid_added.pdf`` on success. If any
        exception occurs, the error is printed and ``input_pdf_path`` is
        returned unchanged so the caller can fall back to the original file.
    """
    try:
        # The output always lives next to the input under a fixed name.
        output_pdf_path = os.path.join(
            os.path.dirname(input_pdf_path), "invalid_added.pdf"
        )

        with open(input_pdf_path, 'rb') as source_file:
            pdf_reader = PdfReader(source_file)
            pdf_writer = PdfWriter()

            for page_num in range(len(pdf_reader.pages)):
                # Original page first ...
                page = pdf_reader.pages[page_num]
                pdf_writer.add_page(page)

                # ... then a marker page built in memory, sized like that page.
                with BytesIO() as packet:
                    page_width = float(page.mediabox.width)
                    page_height = float(page.mediabox.height)
                    c = canvas.Canvas(packet, pagesize=(page_width, page_height))

                    # Marker position: 2.3 cm from the left, 0.5 cm from the top.
                    x_position = 2.3 * cm
                    y_position = page_height - 0.5 * cm
                    c.setFont("Helvetica", 12)
                    c.drawString(
                        x_position,
                        y_position,
                        f"[$$index_mark_{page_num + 1}$$]",
                    )
                    c.save()

                    # Rewind and read back the one-page PDF just generated.
                    packet.seek(0)
                    new_pdf = PdfReader(packet)
                    # Deep-copy so the page stays independent of ``packet``,
                    # which is closed when this ``with`` block ends.
                    marker_page = copy.deepcopy(new_pdf.pages[0])
                    pdf_writer.add_page(marker_page)

            # Serialize while the source file is still open (the reader may
            # still pull page data from it), but into memory rather than
            # straight onto disk.
            output_buffer = BytesIO()
            pdf_writer.write(output_buffer)

        # Touch the output path only once the whole document is known good.
        with open(output_pdf_path, 'wb') as output_file:
            output_file.write(output_buffer.getvalue())

        print("invalid_file added successfully!")
        return output_pdf_path
    except Exception as e:
        # Deliberate best effort: report and fall back to the unmodified input.
        print(f"发生错误: {e}")
        return input_pdf_path
def delete_mark(docx_path):
    """Remove index-marker paragraphs from a .docx document.

    Every body paragraph whose text contains ``"[$$index_mark_"`` is removed.
    If such a paragraph is immediately followed by a paragraph that has no
    visible text and whose XML contains ``"w:sectPr"`` (an empty
    section-break paragraph), that follower is removed as well. A follower
    that carries real text is kept even if it holds a section break, so
    document content is never dropped along with a marker.

    The modified document is saved as ``invalid_del.docx`` in the directory
    of ``docx_path``.

    Args:
        docx_path: Path of the .docx file to clean.

    Returns:
        The path of the saved ``invalid_del.docx``, or ``""`` when the
        document cannot be opened (``KeyError`` or ``PackageNotFoundError``
        while loading; the error is printed). Exceptions raised while saving
        are not caught here.
    """
    try:
        doc = Document(docx_path)
    except KeyError as e:
        print(f"Error opening document: {e}")
        return ""
    except PackageNotFoundError as e:
        print(f"Invalid package: {e}")
        return ""

    # Collect first and remove afterwards, so the paragraph list is not
    # mutated while it is being walked.
    paragraphs = doc.paragraphs
    paragraphs_to_remove = []
    i = 0
    while i < len(paragraphs):
        para = paragraphs[i]
        if "[$$index_mark_" in para.text:
            paragraphs_to_remove.append(para)
            if i + 1 < len(paragraphs):
                next_para = paragraphs[i + 1]
                # Only an *empty* section-break paragraph belongs to the
                # marker; a paragraph with text is document content.
                is_blank = not next_para.text.strip()
                if is_blank and "w:sectPr" in next_para._element.xml:
                    paragraphs_to_remove.append(next_para)
                    i += 1  # already handled; do not examine it again
        i += 1

    # Detach the collected paragraphs from the document XML tree.
    for para in paragraphs_to_remove:
        parent = para._element.getparent()
        if parent is not None:
            parent.remove(para._element)

    new_file_path = os.path.join(os.path.dirname(docx_path), 'invalid_del.docx')
    doc.save(new_file_path)
    return new_file_path
if __name__ == '__main__':
    # Manual smoke run against a local file.
    # To try insert_mark instead, point it at a PDF, for example:
    #   input = r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\ztbfile.pdf'
    #   input = r'C:\Users\Administrator\Desktop\fsdownload\45645668-be5e-4124-b06a-6abaa9d87d86\tmp\ztbfile_invalid.pdf'
    #   output = insert_mark(input)
    doc_path = r'C:\Users\Administrator\Desktop\fsdownload\bf0346dc-8711-43f5-839e-e4076acea84a\invalid_added.docx'
    res = delete_mark(doc_path)
    # delete_mark returns "" when the document could not be opened.
    if not res:
        print("yes")