zbparse/flask_app/general/insert_del_pagemark.py
2024-12-24 17:32:00 +08:00

97 lines
3.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import re
from io import BytesIO
from PyPDF2 import PdfReader, PdfWriter
from docx import Document
from reportlab.pdfgen import canvas
from reportlab.lib.units import cm
def _build_mark_page(page_width, page_height, index):
    """Return a single PDF page that carries only the index-mark text.

    The page is rendered with reportlab into an in-memory buffer and read
    back with PyPDF2 so it can be appended to a ``PdfWriter``.

    :param page_width: width of the page to create, in points
    :param page_height: height of the page to create, in points
    :param index: number embedded in the mark, ``[$$index_mark_<index>$$]``
    :return: the first (only) page of the generated in-memory PDF
    """
    packet = BytesIO()
    mark_canvas = canvas.Canvas(packet, pagesize=(page_width, page_height))
    mark_canvas.setFont("Helvetica", 12)
    # PDF coordinates start at the bottom-left corner, so the top margin is
    # expressed as "page height minus 0.5 cm"; the left margin is 2.3 cm.
    mark_canvas.drawString(
        2.3 * cm,
        page_height - 0.5 * cm,
        f"[$$index_mark_{index}$$]",
    )
    mark_canvas.save()
    packet.seek(0)
    return PdfReader(packet).pages[0]


def insert_mark(input_pdf_path):
    """Copy a PDF and insert an index-mark page after every original page.

    The result is saved as ``invalid_added.pdf`` in the same directory as
    the input. After the N-th original page (1-based) a page of the same
    size is added whose only content is the text ``[$$index_mark_N$$]``.

    Copying the source pages is the step that may fail. The output is
    therefore written to a temporary sibling file and moved into place only
    once it is complete, so a failure never leaves a truncated
    ``invalid_added.pdf`` behind and never truncates the input when the
    input itself is already named ``invalid_added.pdf``.

    :param input_pdf_path: path of the PDF to process
    :return: path of ``invalid_added.pdf`` on success; ``input_pdf_path``
        unchanged if any error occurred (the error is printed, not raised)
    """
    output_pdf_path = os.path.join(
        os.path.dirname(input_pdf_path), "invalid_added.pdf"
    )
    tmp_pdf_path = output_pdf_path + ".tmp"
    try:
        with open(input_pdf_path, 'rb') as source_file:
            pdf_reader = PdfReader(source_file)
            pdf_writer = PdfWriter()
            for index, page in enumerate(pdf_reader.pages, start=1):
                pdf_writer.add_page(page)
                # The mark page takes the size of the page it follows.
                mark_page = _build_mark_page(
                    float(page.mediabox.width),
                    float(page.mediabox.height),
                    index,
                )
                pdf_writer.add_page(mark_page)
            # The source must stay open while writing: the writer still
            # pulls page data from the reader's stream.
            with open(tmp_pdf_path, 'wb') as output_file:
                pdf_writer.write(output_file)
        # Publish the finished file only after the input has been closed.
        os.replace(tmp_pdf_path, output_pdf_path)
        print("invalid_file added successfully!")
        return output_pdf_path
    except Exception as e:
        # Deliberate best-effort: callers fall back to the unmarked input.
        print(f"发生错误: {e}")
        try:
            os.remove(tmp_pdf_path)
        except OSError:
            pass  # nothing was written, or it is already gone
        return input_pdf_path
def delete_mark(docx_path):
    """Remove all ``[$$index_mark_N$$]`` marker paragraphs from a .docx file.

    Every body paragraph whose text contains the marker prefix is removed.
    After a marker has been removed, the next paragraph whose XML contains
    a ``w:sectPr`` element (the section break left over from the marker
    page) is removed as well. The cleaned document is saved as
    ``invalid_del.docx`` in the same directory as the input.

    :param docx_path: path of the .docx file to clean
    :return: path of the saved ``invalid_del.docx``
    """
    document = Document(docx_path)
    pending_section_break = False
    for para in document.paragraphs:
        element = para._element
        if "[$$index_mark_" in para.text:
            # When the marker paragraph itself holds the section break,
            # removing it removes the break too, so nothing is left pending.
            # It must not be removed a second time: once detached,
            # getparent() returns None and a second remove() would raise.
            pending_section_break = "w:sectPr" not in element.xml
            element.getparent().remove(element)
            continue
        if pending_section_break and "w:sectPr" in element.xml:
            # First section-break paragraph after a marker.
            # NOTE(review): removed even if it also contains text — confirm
            # that such paragraphs are always blank in practice.
            element.getparent().remove(element)
            pending_section_break = False

    # Save next to the input under a fixed name.
    new_file_path = os.path.join(os.path.dirname(docx_path), 'invalid_del.docx')
    document.save(new_file_path)
    return new_file_path
if __name__ == '__main__':
    # Manual smoke test with a developer-local sample file.
    # Named pdf_path rather than input so the builtin input() is not shadowed.
    pdf_path = r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\baada43d-24f6-459d-8a81-219d130f20da\ztbfile.pdf'
    # pdf_path = r'C:\Users\Administrator\Desktop\new招标文件\output5\广水市公安局音视频监控系统设备采购项目_procurement.pdf'
    insert_mark(pdf_path)
    # Example for the reverse step on a converted .docx:
    # doc_path = r'C:\Users\Administrator\Desktop\fsdownload\1073c74f-02b5-463e-a129-23c790a3c872\invalid_added.docx'
    # delete_mark(doc_path)