# zbparse/flask_app/general/insert_del_pagemark.py

import os
from docx.opc.exceptions import PackageNotFoundError
from io import BytesIO
from PyPDF2 import PdfReader, PdfWriter
from docx import Document
from reportlab.pdfgen import canvas
from reportlab.lib.units import cm
import copy

def insert_mark(input_pdf_path):
    """
    Copy a PDF to ``invalid_added.pdf`` and append a marker page after each page.

    The output file is written next to the input file. After every original
    page, a generated page of the same size is inserted whose only content is
    the text ``[$$index_mark_<n>$$]`` (``n`` is the 1-based number of the page
    it follows).

    Copying the source pages is the step most likely to fail; any exception
    raised anywhere in the process is printed and the original path is
    returned as a fallback.

    Args:
        input_pdf_path: Path of the PDF to process.

    Returns:
        The path of ``invalid_added.pdf`` on success, otherwise
        ``input_pdf_path`` unchanged.
    """
    try:
        # The result always lives beside the input and has a fixed name.
        output_pdf_path = os.path.join(
            os.path.dirname(input_pdf_path), "invalid_added.pdf"
        )

        with open(input_pdf_path, 'rb') as source_file:
            pdf_reader = PdfReader(source_file)
            pdf_writer = PdfWriter()

            # Marker text starts 2.3 cm from the left edge on every marker page.
            mark_x = 2.3 * cm

            for page_no, source_page in enumerate(pdf_reader.pages, start=1):
                # Original page first, then its marker page.
                pdf_writer.add_page(source_page)

                # Render the marker page into an in-memory PDF.
                with BytesIO() as buffer:
                    width = float(source_page.mediabox.width)
                    height = float(source_page.mediabox.height)

                    marker_canvas = canvas.Canvas(buffer, pagesize=(width, height))
                    marker_canvas.setFont("Helvetica", 12)
                    # Baseline sits 0.5 cm below the top edge of the page.
                    marker_canvas.drawString(
                        mark_x,
                        height - 0.5 * cm,
                        f"[$$index_mark_{page_no}$$]",
                    )
                    marker_canvas.save()

                    # Rewind so the freshly rendered PDF can be read back.
                    buffer.seek(0)
                    rendered = PdfReader(buffer)
                    # Deep-copy so the added page is independent of the
                    # temporary reader and its buffer.
                    marker_page = copy.deepcopy(rendered.pages[0])
                    pdf_writer.add_page(marker_page)

            # Write the interleaved pages to the output file.
            with open(output_pdf_path, 'wb') as target_file:
                pdf_writer.write(target_file)

        print("invalid_file added successfully!")
        return output_pdf_path

    except Exception as err:
        print(f"发生错误: {err}")
        return input_pdf_path

def _is_blank_section_break(paragraph):
    """Return True if *paragraph* holds a ``w:sectPr`` and no visible content."""
    xml = paragraph._element.xml
    if "w:sectPr" not in xml:
        return False
    # A paragraph with real text is content, not a bare section break.
    if paragraph.text.strip():
        return False
    # Embedded graphics/objects have no text but are still content to keep.
    embedded_tags = ("<w:drawing", "<w:pict", "<w:object")
    return not any(tag in xml for tag in embedded_tags)


def delete_mark(docx_path):
    """
    Remove marker paragraphs from a DOCX file and save the result.

    Every body paragraph whose text contains ``[$$index_mark_`` is deleted.
    If the paragraph immediately after a marker is a blank section-break
    paragraph (its XML contains ``w:sectPr`` and it has no text or embedded
    graphics), that paragraph is deleted as well. A following paragraph that
    carries a section break *and* real content is kept, so document content
    is never dropped together with a marker.

    The modified document is saved as ``invalid_del.docx`` in the same
    directory as the input.

    Args:
        docx_path: Path of the DOCX file to clean.

    Returns:
        The path of the saved ``invalid_del.docx``, or ``""`` when
        ``docx_path`` is empty/None or the document cannot be opened
        (``KeyError`` or ``PackageNotFoundError`` from python-docx).
    """
    if not docx_path:
        print("Invalid input: docx_path is None or empty.")
        return ""
    try:
        doc = Document(docx_path)
    except KeyError as e:
        print(f"Error opening document: {e}")
        return ""
    except PackageNotFoundError as e:
        print(f"Invalid package: {e}")
        return ""

    # Collect first and remove afterwards, so the paragraph list is not
    # mutated while it is being walked.
    paragraphs_to_remove = []
    paragraphs = doc.paragraphs
    total = len(paragraphs)
    i = 0
    while i < total:
        para = paragraphs[i]
        if "[$$index_mark_" in para.text:
            paragraphs_to_remove.append(para)
            # Also drop the trailing section break, but only if it is blank.
            if i + 1 < total and _is_blank_section_break(paragraphs[i + 1]):
                paragraphs_to_remove.append(paragraphs[i + 1])
                i += 1  # Already handled; do not examine it again.
        i += 1

    # Detach the collected paragraphs from the document XML.
    for para in paragraphs_to_remove:
        parent = para._element.getparent()
        if parent is not None:
            parent.remove(para._element)

    new_file_path = os.path.join(os.path.dirname(docx_path), 'invalid_del.docx')
    doc.save(new_file_path)
    return new_file_path


if __name__ == '__main__':
    # Manual check: strip the markers from a local sample document and print
    # "yes" when delete_mark returns an empty (falsy) result.
    #
    # Earlier manual runs of insert_mark used these sample PDFs:
    #   r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\ztbfile.pdf'
    #   r'C:\Users\Administrator\Desktop\fsdownload\45645668-be5e-4124-b06a-6abaa9d87d86\tmp\ztbfile_invalid.pdf'
    doc_path = r'C:\Users\Administrator\Desktop\fsdownload\bf0346dc-8711-43f5-839e-e4076acea84a\invalid_added.docx'
    res = delete_mark(doc_path)
    if not res:
        print("yes")