# zbparse/flask_app/general/insert_del_pagemark.py

import os
import re
from io import BytesIO
from PyPDF2 import PdfReader, PdfWriter
from docx import Document
from reportlab.pdfgen import canvas
from reportlab.lib.units import cm


# Copies the pages of input_pdf_path into invalid_added.pdf. This copy step may raise; drawing the new marker page almost never does.
def insert_mark(input_pdf_path):
    """
    Interleave a marker page after every page of a PDF.

    Every page of the input is copied into a new document and followed by an
    extra page of the same size that carries only the text
    ``[$$index_mark_N$$]``, where N is the 1-based number of the page it
    follows. The result is written to ``invalid_added.pdf`` in the same
    directory as the input file.

    :param input_pdf_path: path of the PDF file to read
    :return: path of the written PDF; if any exception is raised along the
        way, the error is printed and ``input_pdf_path`` is returned instead
    """
    try:
        # The output lives next to the input under a fixed file name.
        target_path = os.path.join(os.path.dirname(input_pdf_path), "invalid_added.pdf")

        with open(input_pdf_path, 'rb') as source:
            reader = PdfReader(source)
            writer = PdfWriter()

            # Horizontal offset of the marker text; the same on every page.
            mark_x = 2.3 * cm

            for index, original_page in enumerate(reader.pages, start=1):
                # Original page first, its marker page right after it.
                writer.add_page(original_page)

                # The marker page takes the size of the page it follows.
                width = float(original_page.mediabox.width)
                height = float(original_page.mediabox.height)

                # Render the marker page into an in-memory PDF with reportlab.
                buffer = BytesIO()
                marker_canvas = canvas.Canvas(buffer, pagesize=(width, height))
                marker_canvas.setFont("Helvetica", 12)
                # Vertical position is measured from the top: page height
                # minus a 0.5 cm top margin.
                marker_canvas.drawString(mark_x, height - 0.5 * cm, f"[$$index_mark_{index}$$]")
                marker_canvas.save()

                # Re-read the rendered page with PyPDF2 and append it.
                buffer.seek(0)
                writer.add_page(PdfReader(buffer).pages[0])

            # Write while the input file is still open.
            with open(target_path, 'wb') as sink:
                writer.write(sink)
        print("invalid_file added successfully!")
        return target_path

    except Exception as e:
        # Best effort: report the problem and fall back to the untouched input.
        print(f"发生错误: {e}")
        return input_pdf_path


def delete_mark(docx_path):
    """
    Remove every index-mark paragraph from a docx file and save a cleaned copy.

    Walks ``Document(docx_path).paragraphs``. A paragraph whose text contains
    ``[$$index_mark_`` is removed. Once a marker has been removed, the next
    paragraph whose XML contains ``w:sectPr`` (this may be the marker
    paragraph itself) is treated as the section break that belonged to the
    marker page and is removed too.

    The result is saved as ``invalid_del.docx`` in the directory of
    ``docx_path``.

    :param docx_path: path of the docx file
    :return: path of the saved ``invalid_del.docx``
    """
    document = Document(docx_path)
    pending_section_break = False
    for para in document.paragraphs:
        element = para._element
        # Marker text looks like [$$index_mark_X$$].
        if "[$$index_mark_" in para.text:
            element.getparent().remove(element)
            pending_section_break = True
        if pending_section_break and "w:sectPr" in element.xml:
            # When the marker paragraph itself carried the section break it
            # has just been detached above and getparent() is None; removing
            # it a second time would raise AttributeError.
            parent = element.getparent()
            if parent is not None:
                parent.remove(element)
            pending_section_break = False

    # Save next to the input under a fixed file name.
    new_file_path = os.path.join(os.path.dirname(docx_path), 'invalid_del.docx')
    document.save(new_file_path)
    return new_file_path

if __name__ == '__main__':
    # Manual run against a local sample file. The variables are named
    # input_pdf/output_pdf so the builtin input() is not shadowed.
    input_pdf = r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\baada43d-24f6-459d-8a81-219d130f20da\ztbfile.pdf'
    # Alternative sample input:
    # input_pdf = r'C:\Users\Administrator\Desktop\new招标文件\output5\广水市公安局音视频监控系统设备采购项目_procurement.pdf'
    output_pdf = insert_mark(input_pdf)
    # To strip the markers from a converted document instead:
    # doc_path = r'C:\Users\Administrator\Desktop\fsdownload\1073c74f-02b5-463e-a129-23c790a3c872\invalid_added.docx'
    # delete_mark(doc_path)