import copy
import os
from io import BytesIO

from docx import Document
from docx.opc.exceptions import PackageNotFoundError
from PyPDF2 import PdfReader, PdfWriter
from reportlab.lib.units import cm
from reportlab.pdfgen import canvas

# Leading part of the marker text. insert_mark() writes it on every inserted
# page and delete_mark() searches for it, so both must use the same value.
MARK_PREFIX = "[$$index_mark_"


def _draw_mark(packet, page_width, page_height, page_number):
    """Render a one-page PDF carrying the marker text into ``packet``.

    Args:
        packet: Writable, seekable binary stream that receives the PDF.
        page_width: Width of the generated page, in PDF points.
        page_height: Height of the generated page, in PDF points.
        page_number: 1-based number embedded in the marker text.

    The stream is rewound to position 0 before returning so the caller can
    read the generated PDF straight away.
    """
    c = canvas.Canvas(packet, pagesize=(page_width, page_height))
    # Text position: 2.3 cm from the left edge, 0.5 cm below the top edge.
    x_position = 2.3 * cm
    y_position = page_height - 0.5 * cm
    c.setFont("Helvetica", 12)
    c.drawString(x_position, y_position, f"{MARK_PREFIX}{page_number}$$]")
    # Finish drawing; this flushes the page into the stream.
    c.save()
    packet.seek(0)


def insert_mark(input_pdf_path: str) -> str:
    """Write a copy of a PDF with a marker page inserted after every page.

    The output is named ``invalid_added.pdf`` and is placed in the same
    directory as the input. After each original page, a page of the same
    size is added that contains only the text ``[$$index_mark_<n>$$]``,
    where ``<n>`` is the 1-based number of the page it follows.

    Copying the input pages is the step that may fail; drawing the new page
    almost never does.

    Args:
        input_pdf_path: Path of the PDF to process.

    Returns:
        The path of the generated PDF, or ``input_pdf_path`` unchanged when
        any exception occurs (the error is printed, not raised).
    """
    try:
        # The output lives next to the input file.
        input_dir = os.path.dirname(input_pdf_path)
        output_pdf_path = os.path.join(input_dir, "invalid_added.pdf")

        with open(input_pdf_path, 'rb') as file:
            pdf_reader = PdfReader(file)
            pdf_writer = PdfWriter()

            for page_number, page in enumerate(pdf_reader.pages, start=1):
                # Original page first, then its marker page.
                pdf_writer.add_page(page)

                with BytesIO() as packet:
                    # The marker page has the same size as the page it follows.
                    page_width = float(page.mediabox.width)
                    page_height = float(page.mediabox.height)
                    _draw_mark(packet, page_width, page_height, page_number)

                    new_pdf = PdfReader(packet)
                    # deepcopy keeps the page object independent, so later
                    # changes cannot affect a page that was already added.
                    marker_page = copy.deepcopy(new_pdf.pages[0])
                    pdf_writer.add_page(marker_page)

            # Written while the input file is still open.
            with open(output_pdf_path, 'wb') as output_file:
                pdf_writer.write(output_file)

        print("invalid_file added successfully!")
        return output_pdf_path
    except Exception as e:
        # Deliberate best effort: report the problem and fall back to the
        # untouched input file.
        print(f"发生错误: {e}")
        return input_pdf_path


def delete_mark(docx_path: str) -> str:
    """Remove marker paragraphs from a .docx and save the cleaned copy.

    Every paragraph whose text contains ``[$$index_mark_`` is removed. If
    the paragraph immediately after a marker is a blank section-break
    paragraph (no visible text, XML containing ``w:sectPr``), it is removed
    as well. A following paragraph that holds real text is kept even when
    it carries a section break, so document content is never dropped.

    The result is saved as ``invalid_del.docx`` in the same directory as
    the input document.

    Args:
        docx_path: Path of the document to clean.

    Returns:
        The path of the saved document, or ``""`` when ``docx_path`` is
        empty/None or the document cannot be opened (``KeyError`` or
        ``PackageNotFoundError``).
    """
    if not docx_path:
        print("Invalid input: docx_path is None or empty.")
        return ""

    try:
        doc = Document(docx_path)
    except KeyError as e:
        print(f"Error opening document: {e}")
        return ""
    except PackageNotFoundError as e:
        print(f"Invalid package: {e}")
        return ""

    # Collect first and remove afterwards, so the paragraph list is not
    # modified while it is being walked.
    paragraphs_to_remove = []
    paragraphs = doc.paragraphs
    i = 0
    while i < len(paragraphs):
        para = paragraphs[i]
        if MARK_PREFIX in para.text:
            paragraphs_to_remove.append(para)
            if i + 1 < len(paragraphs):
                next_para = paragraphs[i + 1]
                # Only a blank paragraph counts as the marker page's section
                # break; one with text is document content and must stay.
                is_blank = not next_para.text.strip()
                if is_blank and "w:sectPr" in next_para._element.xml:
                    paragraphs_to_remove.append(next_para)
                    i += 1  # Skip it so it is not processed a second time.
        i += 1

    # Detach the collected paragraphs from the document XML.
    for para in paragraphs_to_remove:
        parent = para._element.getparent()
        if parent is not None:
            parent.remove(para._element)

    new_file_path = os.path.join(os.path.dirname(docx_path), 'invalid_del.docx')
    doc.save(new_file_path)
    return new_file_path


if __name__ == '__main__':
    # input=r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\ztbfile.pdf'
    # input=r'C:\Users\Administrator\Desktop\fsdownload\45645668-be5e-4124-b06a-6abaa9d87d86\tmp\ztbfile_invalid.pdf'
    # output=insert_mark(input)
    doc_path = r'C:\Users\Administrator\Desktop\fsdownload\bf0346dc-8711-43f5-839e-e4076acea84a\invalid_added.docx'
    res = delete_mark(doc_path)
    if not res:
        print("yes")