# zbparse/flask_app/main/merge_pdfs.py

import os

from PyPDF2 import PdfReader, PdfWriter
# Merge PDF files
def merge_pdfs(paths, output_path):
    """Merge several PDF files into one, skipping duplicated boundary pages.

    Files are appended in the order given. If the first page of a file has
    the same extracted text as the last page of the previous non-empty file,
    that first page is treated as a duplicate and left out.

    Args:
        paths: Iterable of PDF file paths, in the order they should appear.
        output_path: Path the merged PDF is written to (opened in ``'wb'``
            mode only after every input has been processed).
    """
    pdf_writer = PdfWriter()
    # Text of the last page of the previous non-empty PDF; None until one
    # has been processed.
    last_page_text = None

    for path in paths:
        pages = PdfReader(path).pages
        page_count = len(pages)
        if page_count == 0:
            # Nothing to add; keep the boundary text of the previous file.
            continue

        start_index = 0
        if last_page_text is not None:
            first_page_text = pages[0].extract_text() or ""
            # Pages without extractable text (e.g. scanned images) all yield
            # an empty string, so equal empty texts prove nothing. Only skip
            # the page when there is real text and it matches.
            if first_page_text.strip() and first_page_text == last_page_text:
                start_index = 1

        for index in range(start_index, page_count):
            pdf_writer.add_page(pages[index])

        # Remember this file's last page for the next boundary comparison.
        last_page_text = pages[-1].extract_text() or ""

    with open(output_path, 'wb') as out:
        pdf_writer.write(out)

def judge_file_exist(original_path, new_suffix):
    """Return the path of the sibling file carrying ``new_suffix``, if any.

    The candidate name is built from the base name of ``original_path`` by
    replacing every ``_qualification1`` or ``_qualification2`` marker with
    ``_<new_suffix>``. The candidate is looked up in the same directory.

    Args:
        original_path: Path whose file name contains a marker, e.g.
            ``'bid_qualification1.pdf'``.
        new_suffix: Marker text to substitute, without the leading
            underscore, e.g. ``'qualification2'``.

    Returns:
        The candidate path if it is an existing regular file, else ``None``.
    """
    import re  # local import keeps this block self-contained

    directory = os.path.dirname(original_path)
    original_filename = os.path.basename(original_path)

    replacement = f"_{new_suffix}"
    # Single pass over the name: chained str.replace calls would rescan the
    # text inserted by the first call and could rewrite it a second time.
    # A callable replacement keeps backslashes in the suffix literal.
    new_filename = re.sub(
        r"_qualification[12]",
        lambda _match: replacement,
        original_filename,
    )
    new_file_path = os.path.join(directory, new_filename)

    return new_file_path if os.path.isfile(new_file_path) else None

def merge_and_cleanup(output_pdf_path, suffix_to_merge):
    """Merge the companion PDF into ``output_pdf_path`` and delete it.

    The companion is located with ``judge_file_exist``. When it exists, the
    two files are merged (output first, companion second) back into
    ``output_pdf_path`` and the companion file is removed.

    Args:
        output_pdf_path: PDF that receives the merged result.
        suffix_to_merge: Suffix identifying the companion file, passed
            through to ``judge_file_exist``.
    """
    another_file_path = judge_file_exist(output_pdf_path, suffix_to_merge)
    if not another_file_path:
        return

    # If the companion resolves to the output file itself, merging would
    # duplicate its pages and the removal below would delete the result.
    if os.path.realpath(another_file_path) == os.path.realpath(output_pdf_path):
        return

    paths = [output_pdf_path, another_file_path]  # PDFs to merge, in order
    merge_pdfs(paths, output_pdf_path)
    os.remove(another_file_path)
    print(f"文件 {another_file_path} 已删除。")