12.18 截取pdf

This commit is contained in:
zy123 2024-12-18 14:02:46 +08:00
parent b8d310a0b0
commit ca56a0894f
2 changed files with 34 additions and 1 deletions

View File

@ -108,4 +108,22 @@ def convert_to_pdf(file_path):
return docx2pdf(file_path) return docx2pdf(file_path)
return file_path return file_path
def get_invalid_file(file_path,output_folder,common_header):
    """Open a PDF and build the begin/end heading patterns for the "invalid bid" cut.

    The visible body only reads the page count and compiles two lists of
    patterns; it does not yet scan pages, write any file, or return a value
    (so the function currently returns ``None``).

    Args:
        file_path: Path of the PDF handed to ``PdfReader``.
        output_folder: Accepted but not used by the code below.
        common_header: Accepted but not used by the code below.

    NOTE(review): a call site added in the same commit passes four positional
    arguments (including ``begin_pattern``) and wraps the result in
    ``list(...)`` -- confirm the intended signature and return value before
    relying on this function.
    """
    pdf_document = PdfReader(file_path)
    total_pages = len(pdf_document.pages)

    # The second begin pattern uses variable-width look-behinds such as
    # (?<!见\s*), which is why the third-party ``regex`` module is used here.
    begin_pattern = [
        # Chapter/part headings mentioning an announcement or invitation,
        # plus a few bare headings ("第一卷", "投标邀请书", ...).
        regex.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请',
            regex.MULTILINE
        ),
        # A line ending with an announcement/invitation title that is not
        # directly preceded by "见", "与" or a quotation mark.
        regex.compile(
            r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)]?\s*$',
            regex.MULTILINE
        ),
    ]
    end_pattern = [
        # Contract chapter, bid-clearing report, or the start of volume two.
        regex.compile(
            r'第[一二三四五六七八九十]+章\s*合同|[:]清标报告|^第二卷',
            regex.MULTILINE
        ),
        # Chapter/part heading about the bid/response file format.
        regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*', regex.MULTILINE),
        # Bare "file format" title; the original was missing the closing
        # parenthesis of this compile() call, which made the file unparsable.
        regex.compile(r"\s*(投标文件格式|响应文件格式|响应性文件格式)\s*"),
    ]

View File

@ -5,7 +5,7 @@ import time
from PyPDF2 import PdfReader, PdfWriter from PyPDF2 import PdfReader, PdfWriter
from flask_app.general.clean_pdf import clean_page_content from flask_app.general.clean_pdf import clean_page_content
from flask_app.general.截取pdf通用函数 import get_start_and_common_header,save_extracted_pages from flask_app.general.截取pdf通用函数 import get_start_and_common_header,save_extracted_pages,get_invalid_file
from flask_app.general.通用功能函数 import get_global_logger from flask_app.general.通用功能函数 import get_global_logger
def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header, def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header,
@ -383,6 +383,21 @@ def truncate_pdf_main_engineering(input_path, output_folder, selection,logger,ou
5: 0 # 无效标 5: 0 # 无效标
}.get(selection, 0) }.get(selection, 0)
if selection == 5: #无效标
invalid_path = list(
get_invalid_file(
file_path,
output_folder,
begin_pattern,
common_header,
)
)
if invalid_path:
return invalid_path
else:
# print(f"Selection {selection}: 使用组合提取函数失败。尝试下一个模式对。")
continue
if selection == 4: # 投标人须知 if selection == 4: # 投标人须知
output_paths = list( output_paths = list(
extract_pages_tobidders_notice( extract_pages_tobidders_notice(