12.18 截取pdf
This commit is contained in:
parent
b8d310a0b0
commit
ca56a0894f
@ -108,4 +108,22 @@ def convert_to_pdf(file_path):
|
||||
return docx2pdf(file_path)
|
||||
return file_path
|
||||
|
||||
def get_invalid_file(file_path, output_folder, common_header):
    """Open a PDF and build the heading patterns that delimit a section.

    The function reads ``file_path`` with ``PdfReader``, counts its pages and
    compiles two lists of patterns: candidates for the first heading of the
    section (announcement / invitation headings) and candidates for the
    heading that ends it (contract chapter, bid-evaluation report, second
    volume, bid/response file-format chapters).

    As written in this revision the function stops after building the
    patterns: ``output_folder`` and ``common_header`` are not used yet and
    nothing is returned.

    NOTE(review): the visible caller passes four positional arguments
    (file_path, output_folder, begin_pattern, common_header) and wraps the
    result in ``list(...)``; that does not match this signature or the
    implicit ``None`` return -- confirm the intended interface before
    finishing the implementation.

    Args:
        file_path: Path of the PDF to read.
        output_folder: Currently unused.
        common_header: Currently unused.
    """
    pdf_document = PdfReader(file_path)
    total_pages = len(pdf_document.pages)

    # Candidate patterns for the first heading of the section.
    begin_pattern = [
        # A line starting with a "第X章/部分 ... 公告/邀请..." heading, or
        # with "第一卷" / "投标邀请(书|函)".
        regex.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请',
            regex.MULTILINE,
        ),
        # A line ending in an announcement/invitation title that is not
        # directly preceded by "见", "与" or a quotation mark. The
        # variable-width lookbehinds are why the `regex` package is used
        # here rather than the standard `re` module.
        regex.compile(
            r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
            regex.MULTILINE,
        ),
    ]

    # Candidate patterns for the heading that terminates the section.
    end_pattern = [
        regex.compile(
            r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|^第二卷',
            regex.MULTILINE,
        ),
        regex.compile(
            r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*',
            regex.MULTILINE,
        ),
        # Fixed: this compile call was missing its closing parenthesis.
        regex.compile(r"\s*(投标文件格式|响应文件格式|响应性文件格式)\s*"),
    ]
|
||||
|
||||
|
||||
|
@ -5,7 +5,7 @@ import time
|
||||
from PyPDF2 import PdfReader, PdfWriter
|
||||
from flask_app.general.clean_pdf import clean_page_content
|
||||
|
||||
from flask_app.general.截取pdf通用函数 import get_start_and_common_header,save_extracted_pages
|
||||
from flask_app.general.截取pdf通用函数 import get_start_and_common_header,save_extracted_pages,get_invalid_file
|
||||
from flask_app.general.通用功能函数 import get_global_logger
|
||||
|
||||
def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header,
|
||||
@ -383,6 +383,21 @@ def truncate_pdf_main_engineering(input_path, output_folder, selection,logger,ou
|
||||
5: 0 # 无效标
|
||||
}.get(selection, 0)
|
||||
|
||||
if selection == 5: #无效标
|
||||
invalid_path = list(
|
||||
get_invalid_file(
|
||||
file_path,
|
||||
output_folder,
|
||||
begin_pattern,
|
||||
common_header,
|
||||
)
|
||||
)
|
||||
if invalid_path:
|
||||
return invalid_path
|
||||
else:
|
||||
# print(f"Selection {selection}: 使用组合提取函数失败。尝试下一个模式对。")
|
||||
continue
|
||||
|
||||
if selection == 4: # 投标人须知
|
||||
output_paths = list(
|
||||
extract_pages_tobidders_notice(
|
||||
|
Loading…
x
Reference in New Issue
Block a user