2024-12-17 14:47:19 +08:00
|
|
|
|
import os
|
2025-01-03 10:45:32 +08:00
|
|
|
|
from docx.opc.exceptions import PackageNotFoundError
|
2024-12-17 14:47:19 +08:00
|
|
|
|
from io import BytesIO
|
|
|
|
|
from PyPDF2 import PdfReader, PdfWriter
|
|
|
|
|
from docx import Document
|
|
|
|
|
from reportlab.pdfgen import canvas
|
|
|
|
|
from reportlab.lib.units import cm
|
2025-01-06 10:34:30 +08:00
|
|
|
|
import copy
|
2024-12-24 17:32:00 +08:00
|
|
|
|
|
|
|
|
|
# Copying the pages of input_pdf_path into invalid_added.pdf is the step that may raise an error; drawing the new marker page almost never fails.
|
2024-12-17 14:47:19 +08:00
|
|
|
|
def _build_mark_page(page_number, page_width, page_height):
    """Render a single PDF page that carries only the index marker text.

    Args:
        page_number: 1-based number written into the marker text.
        page_width: Width of the page to create, in PDF points.
        page_height: Height of the page to create, in PDF points.

    Returns:
        A page object holding the text ``[$$index_mark_<page_number>$$]``.
    """
    with BytesIO() as packet:
        marker_canvas = canvas.Canvas(packet, pagesize=(page_width, page_height))

        # Text position: 2.3 cm from the left edge, 0.5 cm below the top edge.
        marker_canvas.setFont("Helvetica", 12)
        marker_canvas.drawString(
            2.3 * cm,
            page_height - 0.5 * cm,
            f"[$$index_mark_{page_number}$$]",
        )

        # Finish drawing; this writes the PDF bytes into the in-memory stream.
        marker_canvas.save()

        # Rewind the stream and read back the page that was just generated.
        packet.seek(0)
        marker_pdf = PdfReader(packet)

        # deepcopy keeps the returned page independent, so later changes
        # cannot affect pages that were already added to the writer.
        return copy.deepcopy(marker_pdf.pages[0])


def insert_mark(input_pdf_path):
    """Insert a marker page after every page of a PDF.

    Each original page is followed by a new page of the same size that
    contains the text ``[$$index_mark_<n>$$]``, where ``n`` is the 1-based
    number of the preceding page. The result is saved as
    ``invalid_added.pdf`` in the same directory as the input file.

    Args:
        input_pdf_path: Path of the PDF file to process.

    Returns:
        The path of the generated ``invalid_added.pdf`` on success. If any
        exception occurs, the error is printed and ``input_pdf_path`` is
        returned unchanged.
    """
    try:
        # The output lives next to the input and is always named invalid_added.pdf.
        output_pdf_path = os.path.join(
            os.path.dirname(input_pdf_path), "invalid_added.pdf"
        )
        pdf_writer = PdfWriter()

        with BytesIO() as output_buffer:
            with open(input_pdf_path, 'rb') as file:
                pdf_reader = PdfReader(file)

                for page_number, page in enumerate(pdf_reader.pages, start=1):
                    # Original page first, then its marker page.
                    pdf_writer.add_page(page)
                    pdf_writer.add_page(
                        _build_mark_page(
                            page_number,
                            float(page.mediabox.width),
                            float(page.mediabox.height),
                        )
                    )

                # Serialize into memory while the input file is still open.
                # The output file is only created once this has succeeded, so
                # a failure here cannot leave a truncated invalid_added.pdf
                # behind, and the input is never opened for writing while it
                # is still being read.
                pdf_writer.write(output_buffer)

            with open(output_pdf_path, 'wb') as output_file:
                output_file.write(output_buffer.getvalue())

        print("invalid_file added successfully!")
        return output_pdf_path

    except Exception as e:
        # Best effort: report the problem and let the caller continue with
        # the unmodified input file.
        print(f"发生错误: {e}")
        return input_pdf_path
|
2024-12-17 14:47:19 +08:00
|
|
|
|
|
|
|
|
|
def _is_blank_section_break(paragraph):
    """Return True if the paragraph has no text and its XML contains w:sectPr."""
    return not paragraph.text.strip() and "w:sectPr" in paragraph._element.xml


def delete_mark(docx_path):
    """Remove index-marker paragraphs from a Word document.

    Every paragraph whose text contains ``[$$index_mark_`` is deleted. If the
    paragraph immediately after a marker is a blank paragraph whose XML
    contains ``w:sectPr`` (an empty section-break paragraph), it is deleted
    as well. A following paragraph that carries text is kept even when it
    holds a ``w:sectPr``, so document content is never dropped along with
    the marker.

    The modified document is saved as ``invalid_del.docx`` in the same
    directory as the input file.

    Args:
        docx_path: Path of the .docx file to clean.

    Returns:
        The path of the saved ``invalid_del.docx``, or ``""`` when
        ``docx_path`` is empty/None or the document cannot be opened
        (``KeyError`` or ``PackageNotFoundError``).
    """
    if not docx_path:
        print("Invalid input: docx_path is None or empty.")
        return ""

    try:
        doc = Document(docx_path)
    except KeyError as e:
        print(f"Error opening document: {e}")
        return ""
    except PackageNotFoundError as e:
        print(f"Invalid package: {e}")
        return ""

    # Collect the paragraphs first so nothing is removed while scanning.
    paragraphs = doc.paragraphs
    paragraphs_to_remove = []
    i = 0
    while i < len(paragraphs):
        para = paragraphs[i]
        if "[$$index_mark_" in para.text:
            paragraphs_to_remove.append(para)
            # Also drop the empty section-break paragraph that directly
            # follows the marker, if there is one.
            if i + 1 < len(paragraphs):
                next_para = paragraphs[i + 1]
                if _is_blank_section_break(next_para):
                    paragraphs_to_remove.append(next_para)
                    i += 1  # Skip it so it is not processed a second time.
        i += 1

    # Detach the collected paragraphs from the document XML.
    for para in paragraphs_to_remove:
        parent = para._element.getparent()
        if parent is not None:
            parent.remove(para._element)

    new_file_path = os.path.join(os.path.dirname(docx_path), 'invalid_del.docx')
    doc.save(new_file_path)
    return new_file_path
|
2024-12-17 14:47:19 +08:00
|
|
|
|
|
2025-02-14 15:46:07 +08:00
|
|
|
|
|
|
|
|
|
|
2024-12-17 14:47:19 +08:00
|
|
|
|
if __name__ == '__main__':
    # Manual smoke test. The insert_mark invocation below is kept disabled:
    # input=r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\ztbfile.pdf'
    # input=r'C:\Users\Administrator\Desktop\fsdownload\45645668-be5e-4124-b06a-6abaa9d87d86\tmp\ztbfile_invalid.pdf'
    # output=insert_mark(input)

    # Strip the markers from a sample document.
    sample_docx = r'C:\Users\Administrator\Desktop\fsdownload\bf0346dc-8711-43f5-839e-e4076acea84a\invalid_added.docx'
    cleaned_path = delete_mark(sample_docx)

    # An empty result means delete_mark could not produce an output file.
    if not cleaned_path:
        print("yes")
|