12.18 截取pdf
This commit is contained in:
parent
b8d310a0b0
commit
ca56a0894f
@ -108,4 +108,22 @@ def convert_to_pdf(file_path):
|
|||||||
return docx2pdf(file_path)
|
return docx2pdf(file_path)
|
||||||
return file_path
|
return file_path
|
||||||
|
|
||||||
|
def get_invalid_file(file_path,output_folder,common_header):
|
||||||
|
pdf_document = PdfReader(file_path)
|
||||||
|
total_pages = len(pdf_document.pages)
|
||||||
|
begin_pattern=[regex.compile(
|
||||||
|
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请',regex.MULTILINE
|
||||||
|
),
|
||||||
|
regex.compile(
|
||||||
|
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
|
||||||
|
regex.MULTILINE
|
||||||
|
)]
|
||||||
|
end_pattern=[regex.compile(
|
||||||
|
r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|^第二卷',
|
||||||
|
regex.MULTILINE
|
||||||
|
),
|
||||||
|
regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*', regex.MULTILINE),
|
||||||
|
regex.compile(r"\s*(投标文件格式|响应文件格式|响应性文件格式)\s*"
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@ -5,7 +5,7 @@ import time
|
|||||||
from PyPDF2 import PdfReader, PdfWriter
|
from PyPDF2 import PdfReader, PdfWriter
|
||||||
from flask_app.general.clean_pdf import clean_page_content
|
from flask_app.general.clean_pdf import clean_page_content
|
||||||
|
|
||||||
from flask_app.general.截取pdf通用函数 import get_start_and_common_header,save_extracted_pages
|
from flask_app.general.截取pdf通用函数 import get_start_and_common_header,save_extracted_pages,get_invalid_file
|
||||||
from flask_app.general.通用功能函数 import get_global_logger
|
from flask_app.general.通用功能函数 import get_global_logger
|
||||||
|
|
||||||
def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header,
|
def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header,
|
||||||
@ -383,6 +383,21 @@ def truncate_pdf_main_engineering(input_path, output_folder, selection,logger,ou
|
|||||||
5: 0 # 无效标
|
5: 0 # 无效标
|
||||||
}.get(selection, 0)
|
}.get(selection, 0)
|
||||||
|
|
||||||
|
if selection == 5: #无效标
|
||||||
|
invalid_path = list(
|
||||||
|
get_invalid_file(
|
||||||
|
file_path,
|
||||||
|
output_folder,
|
||||||
|
begin_pattern,
|
||||||
|
common_header,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if invalid_path:
|
||||||
|
return invalid_path
|
||||||
|
else:
|
||||||
|
# print(f"Selection {selection}: 使用组合提取函数失败。尝试下一个模式对。")
|
||||||
|
continue
|
||||||
|
|
||||||
if selection == 4: # 投标人须知
|
if selection == 4: # 投标人须知
|
||||||
output_paths = list(
|
output_paths = list(
|
||||||
extract_pages_tobidders_notice(
|
extract_pages_tobidders_notice(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user