From ca56a0894f0347e805073e79a74d6f082891c6f6 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Wed, 18 Dec 2024 14:02:46 +0800 Subject: [PATCH] =?UTF-8?q?12.18=20=E6=88=AA=E5=8F=96pdf?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/general/截取pdf通用函数.py | 18 ++++++++++++++++++ flask_app/工程标/截取pdf工程标版.py | 17 ++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/flask_app/general/截取pdf通用函数.py b/flask_app/general/截取pdf通用函数.py index 19876d3..30b2ef9 100644 --- a/flask_app/general/截取pdf通用函数.py +++ b/flask_app/general/截取pdf通用函数.py @@ -108,4 +108,22 @@ def convert_to_pdf(file_path): return docx2pdf(file_path) return file_path +def get_invalid_file(file_path,output_folder,common_header): + pdf_document = PdfReader(file_path) + total_pages = len(pdf_document.pages) + begin_pattern=[regex.compile( + r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请',regex.MULTILINE + ), + regex.compile( + r'.*(?