12.18 截取pdf
This commit is contained in:
parent
b9c1201ec8
commit
ccfbe522f8
@ -24,36 +24,33 @@ def truncate_pdf_multiple(pdf_path, output_folder, logger,mode='goods',selection
|
||||
返回:
|
||||
- list: 截取和合并后的文件路径列表,如果截取或合并失败,则包含空字符串。
|
||||
"""
|
||||
if selections is None:
|
||||
selections = [1, 2, 3, 4, 5]
|
||||
|
||||
# 确认模式有效
|
||||
if mode not in ['goods', 'engineering']:
|
||||
raise ValueError("mode 参数必须是 'goods' 或 'engineering'")
|
||||
def handle_exception(selection):
|
||||
return ["", ""] if selection == 4 else [""]
|
||||
# 设置模式相关的参数和函数
|
||||
if mode == 'goods':
|
||||
logger.info("call 货物标截取pdf")
|
||||
truncate_function = truncate_pdf_main_goods
|
||||
merge_mode = 'goods'
|
||||
modes_config = {
|
||||
"goods": {"selections": [1, 2, 3, 4, 5, 6], "truncate_func": truncate_pdf_main_goods},
|
||||
"engineering": {"selections": [1, 2, 3, 4, 5], "truncate_func": truncate_pdf_main_engineering},
|
||||
}
|
||||
|
||||
# 根据 'goods' 模式定义异常处理的逻辑
|
||||
else: # mode == 'engineering'
|
||||
logger.info("call 工程标标截取pdf")
|
||||
truncate_function = truncate_pdf_main_engineering
|
||||
merge_mode = 'engineering'
|
||||
# 验证 mode 是否有效
|
||||
if mode not in modes_config:
|
||||
raise ValueError("mode 参数必须是 'goods' 或 'engineering'")
|
||||
|
||||
num_selections = len(selections)
|
||||
res = check_pdf_pages(pdf_path, logger)
|
||||
if res is not None:
|
||||
return res # 返回包含空字符串的列表
|
||||
# 初始化 mode 参数
|
||||
config = modes_config[mode]
|
||||
truncate_function = config["truncate_func"]
|
||||
selections = selections or config["selections"]
|
||||
|
||||
# 检查 PDF 页数逻辑
|
||||
skip, empty_return = check_pdf_pages(pdf_path, mode, logger)
|
||||
if skip:
|
||||
return empty_return
|
||||
|
||||
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] # 纯文件名
|
||||
truncate_files = []
|
||||
|
||||
# 使用 ThreadPoolExecutor 进行多线程处理
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=num_selections) as executor:
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
|
||||
# 提交所有任务并保持 selection 顺序
|
||||
future_to_selection = {
|
||||
selection: executor.submit(
|
||||
@ -73,14 +70,17 @@ def truncate_pdf_multiple(pdf_path, output_folder, logger,mode='goods',selection
|
||||
future = future_to_selection.get(selection)
|
||||
try:
|
||||
files = future.result()
|
||||
if files:
|
||||
# 扁平化返回的结果
|
||||
if isinstance(files, list):
|
||||
truncate_files.extend(files)
|
||||
# 无需额外处理,因为 `truncate_function` 已处理空字符串的添加和日志记录
|
||||
elif isinstance(files, str): # 如果返回单个字符串,直接添加
|
||||
truncate_files.append(files)
|
||||
else:
|
||||
logger.warning(f"未知的返回类型: {type(files)},跳过该结果")
|
||||
except Exception as e:
|
||||
logger.error(f"Selection {selection} 生成了一个异常: {e}")
|
||||
# 根据模式和 selection 添加相应数量的空字符串
|
||||
truncate_files.extend(handle_exception(selection))
|
||||
|
||||
# 定义合并后的输出路径
|
||||
merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
|
||||
|
||||
@ -90,7 +90,7 @@ def truncate_pdf_multiple(pdf_path, output_folder, logger,mode='goods',selection
|
||||
truncate_files,
|
||||
merged_output_path,
|
||||
base_file_name,
|
||||
mode=merge_mode
|
||||
mode=mode
|
||||
)
|
||||
|
||||
if merged_path:
|
||||
@ -109,15 +109,17 @@ if __name__ == "__main__":
|
||||
logger=get_global_logger("123")
|
||||
start_time = time.time()
|
||||
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
|
||||
pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf"
|
||||
|
||||
# pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf"
|
||||
pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
|
||||
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
|
||||
# pdf_path=r'C:\Users\Administrator\Desktop\货物标\zbfiles\招标文件(实高电子显示屏).pdf'
|
||||
# input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
|
||||
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp"
|
||||
# selections = [1, 4] # 仅处理 selection 4、1
|
||||
# selections=[5]
|
||||
selections=[6]
|
||||
#engineering
|
||||
files=truncate_pdf_multiple(pdf_path,output_folder,logger,'goods')
|
||||
# files=truncate_pdf_multiple(pdf_path,output_folder,logger,'goods',selections)
|
||||
files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'engineering')
|
||||
print(files)
|
||||
# selection = 1 # 例如:1 - 招标公告, 2 - 评标办法, 3 -资格审查条件 4-投标人须知前附表+正文 5-无效标
|
||||
# generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection,logger)
|
||||
|
@ -7,6 +7,7 @@ from PyPDF2 import PdfReader, PdfWriter
|
||||
|
||||
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
|
||||
from flask_app.general.format_change import docx2pdf
|
||||
import concurrent.futures
|
||||
|
||||
|
||||
def get_start_and_common_header(input_path,end_page):
|
||||
@ -34,20 +35,26 @@ def get_start_and_common_header(input_path,end_page):
|
||||
return common_header, last_begin_index
|
||||
return common_header, last_begin_index
|
||||
|
||||
def check_pdf_pages(pdf_path, mode, logger, page_threshold=50):
    """Decide whether a PDF is too short (or unreadable) to be split.

    Args:
        pdf_path: Path of the PDF whose page count is inspected.
        mode: Truncation mode. 'goods' produces an 8-slot placeholder result,
            any other value a 7-slot one.
        logger: Logger used for the page-count and skip/error messages.
        page_threshold: PDFs with at most this many pages are not split.
            Defaults to 50, the value that used to be hard-coded.

    Returns:
        tuple: ``(skip, placeholder)``. ``skip`` is True when splitting must
        be skipped (short or unreadable PDF); ``placeholder`` is then a list
        of empty strings the caller returns as its result. When splitting
        should proceed the tuple is ``(False, [])``.
    """
    # The placeholder mirrors the length of a full truncation result; the
    # goods pipeline has one more output slot than the engineering one.
    placeholder = [''] * (8 if mode == 'goods' else 7)
    try:
        reader = PdfReader(pdf_path)
        num_pages = len(reader.pages)
        logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}")
        if num_pages <= page_threshold:
            logger.info(f"PDF页数小于或等于{page_threshold}页,跳过切分逻辑。")
            return True, placeholder
        # Enough pages: tell the caller to continue with the split.
        return False, []
    except Exception as e:
        # An unreadable PDF cannot be split either, so it is reported as
        # "skip" with the same placeholder result.
        logger.error(f"无法读取 PDF 页数: {e}")
        return True, placeholder
|
||||
|
||||
|
||||
def save_extracted_pages(pdf_path,output_folder,start_page, end_page, output_suffix,common_header):
|
||||
@ -108,22 +115,151 @@ def convert_to_pdf(file_path):
|
||||
return docx2pdf(file_path)
|
||||
return file_path
|
||||
|
||||
def get_invalid_file(file_path, output_folder, common_header, begin_page=0):
    """Extract the page range used for the "invalid bid" analysis and save it.

    The start page is searched front to back with the begin patterns; the end
    page is searched back to front with the end patterns. Two size rules
    apply: with fewer than 100 pages no start search is done and the range
    starts at page 0; with more than 200 pages the end search runs front to
    back (skipping the first 30 pages) instead.

    Args:
        file_path (str): Path of the input PDF.
        output_folder (str): Folder the extracted PDF is written to.
        common_header (str): Common page header text removed from every page
            before matching.
        begin_page (int): Page index at which the start-page search begins.
            Defaults to 0 so that three-argument callers keep working.

    Returns:
        str: Whatever ``save_extracted_pages`` returns for the chosen range,
        or an empty string when the range is invalid or any error occurs.
        A string is returned on every path because callers wrap a truthy
        result in a list themselves.
    """
    try:
        # Headings that open the tender notice / invitation chapter. The
        # second pattern rejects matches preceded by 见/与 or a quote mark.
        begin_patterns = [
            regex.compile(
                r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请',
                regex.MULTILINE
            ),
            regex.compile(
                r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
                regex.MULTILINE
            )
        ]

        # Headings that mark the end of the range, tried in priority order.
        end_patterns = [
            regex.compile(
                r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告',
                regex.MULTILINE
            ),
            regex.compile(
                r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*',
                regex.MULTILINE
            ),
            regex.compile(
                r"\s*(投标文件|响应文件|响应性文件)(?:的)?格式\s*",
                regex.MULTILINE
            )
        ]

        # Pages containing any of these phrases are never taken as a start page.
        exclusion_pattern = regex.compile(
            r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成',
            regex.MULTILINE
        )

        pdf_document = PdfReader(file_path)
        total_pages = len(pdf_document.pages)

        # Extract and clean every page once; both searches read this list.
        page_texts = []
        for page in pdf_document.pages:
            text = page.extract_text()
            page_texts.append(clean_page_content(text, common_header) if text else "")

        def find_start_page(first_index):
            # Each pattern scans a window of at most 30 pages from first_index.
            for pattern in begin_patterns:
                for i in range(first_index, min(first_index + 30, total_pages)):
                    text = page_texts[i]
                    if exclusion_pattern.search(text):
                        continue
                    if pattern.search(text):
                        return i
            return 0  # no match: start from the first page

        def find_end_page():
            if total_pages > 200:
                # Large document: scan forwards, skipping the first 30 pages.
                candidates = range(30, total_pages)
            else:
                # Otherwise scan backwards and stop before the first 30 pages.
                candidates = range(total_pages - 1, 29, -1)
            for pattern in end_patterns:
                for i in candidates:
                    if pattern.search(page_texts[i]):
                        return i

            # Nothing matched: fall back to a size-based end page.
            if total_pages > 100:
                print("未找到结束模式,总页数大于100,设置结束页为前100页。"+file_path)
                return max(100, int(total_pages * 2 / 3))
            print("未找到结束模式,设置结束页为最后一页。"+file_path)
            return total_pages - 1

        # Both searches only read in-memory text, so they are called directly
        # rather than through a thread pool.
        start_page = 0 if total_pages < 100 else find_start_page(begin_page)
        end_page = find_end_page()

        if start_page > end_page:
            print(f"无效的页码范围: 起始页 ({start_page + 1}) > 结束页 ({end_page + 1})")
            # Return a plain empty string, not [""]: a non-empty list is
            # truthy and callers would wrap it into a nested list.
            return ""

        return save_extracted_pages(
            pdf_path=file_path,
            output_folder=output_folder,
            start_page=start_page,
            end_page=end_page,
            output_suffix="invalid",
            common_header=common_header
        )

    except Exception as e:
        print(f"处理文件 {file_path} 时发生错误: {e}")
        return ""
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke run against a local sample tender document.
    file_path = r'C:\Users\Administrator\Desktop\new招标文件\货物标\HBDL-2024-0310-001-招标文件.pdf'
    output_folder = r'C:\Users\Administrator\Desktop\new招标文件\货物标\tmp'
    # get_invalid_file takes the page index to start searching from as its
    # fourth argument; pass 0 explicitly so the call matches the signature.
    res = get_invalid_file(file_path, output_folder, "", 0)
    print(res)
|
||||
|
||||
|
@ -54,7 +54,7 @@ def preprocess_files(output_folder, file_path, file_type,logger):
|
||||
tobidders_notice_table = truncate_files[3] #投标人须知前附表
|
||||
tobidders_notice=truncate_files[4] #投标人须知正文
|
||||
|
||||
invalid_path=truncate_files[5] if truncate_files[5]!="" else pdf_path #无效标
|
||||
invalid_path=truncate_files[5] if truncate_files[5] != "" else pdf_path #无效标
|
||||
# invalid_docpath = copy_docx(docx_path) # docx截取无效标部分
|
||||
# invalid_docpath=pdf2docx(invalid_path)
|
||||
invalid_added_pdf = insert_mark(invalid_path)
|
||||
|
@ -43,7 +43,7 @@ def preprocess_files(output_folder, file_path, file_type,logger):
|
||||
truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods') # index: 0->商务技术服务要求 1->评标办法 2->资格审查 3->投标人须知前附表 4->投标人须知正文
|
||||
|
||||
# 处理各个部分
|
||||
invalid_path=pdf_path
|
||||
invalid_path=truncate_files[6] if truncate_files[6] != "" else pdf_path #无效标
|
||||
|
||||
invalid_added_pdf = insert_mark(invalid_path)
|
||||
invalid_added_docx = pdf2docx(invalid_added_pdf) # 有标记的invalid_path
|
||||
@ -62,7 +62,7 @@ def preprocess_files(output_folder, file_path, file_type,logger):
|
||||
qualification_path = truncate_files[2] # 资格审查
|
||||
tobidders_notice_path = truncate_files[4] # 投标人须知正文
|
||||
notice_path = truncate_files[0] #招标公告
|
||||
merged_baseinfo_path = truncate_files[6] # 合并封面+招标公告+投标人须知前附表+须知正文
|
||||
merged_baseinfo_path = truncate_files[7] # 合并封面+招标公告+投标人须知前附表+须知正文
|
||||
clause_path = convert_clause_to_json(tobidders_notice_path, output_folder) # 投标人须知正文条款pdf->json
|
||||
|
||||
end_time = time.time()
|
||||
|
@ -1,10 +1,9 @@
|
||||
import regex
|
||||
|
||||
begin_pattern = regex.compile(
|
||||
r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)'
|
||||
r'(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)\s*须知正文\s*$',
|
||||
regex.MULTILINE
|
||||
)
|
||||
r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|^第二卷',
|
||||
regex.MULTILINE
|
||||
)
|
||||
|
||||
|
||||
# 测试示例
|
||||
|
@ -220,14 +220,6 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
|
||||
return []
|
||||
else:
|
||||
return [save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)]
|
||||
elif output_suffix == "invalid":
|
||||
pdf_document = PdfReader(pdf_path)
|
||||
total_pages = len(pdf_document.pages)
|
||||
# 计算总页数的三分之二
|
||||
total = int(total_pages * 2 / 3)
|
||||
start_page = last_begin_index
|
||||
end_page = min(90, total)
|
||||
return [save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)]
|
||||
else:
|
||||
print(f"{output_suffix} twice: 未定义的输出后缀。")
|
||||
return []
|
||||
@ -346,29 +338,35 @@ def truncate_pdf_main_engineering(input_path, output_folder, selection,logger,ou
|
||||
]
|
||||
output_suffix = "tobidders_notice"
|
||||
elif selection == 5:
|
||||
# Selection 5: 无效标
|
||||
pattern_pairs = [
|
||||
(
|
||||
regex.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'
|
||||
),
|
||||
regex.compile(
|
||||
r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|^第二卷',
|
||||
regex.MULTILINE
|
||||
)
|
||||
),
|
||||
(
|
||||
regex.compile(
|
||||
r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
|
||||
regex.MULTILINE
|
||||
),
|
||||
regex.compile(
|
||||
r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|^第二卷',
|
||||
regex.MULTILINE
|
||||
)
|
||||
)
|
||||
]
|
||||
output_suffix = "invalid"
|
||||
begin_page=last_begin_index
|
||||
invalid_path = get_invalid_file(file_path,output_folder,common_header,begin_page)
|
||||
if invalid_path:
|
||||
return [invalid_path]
|
||||
else:
|
||||
return [""]
|
||||
# # Selection 5: 无效标
|
||||
# pattern_pairs = [
|
||||
# (
|
||||
# regex.compile(
|
||||
# r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'
|
||||
# ),
|
||||
# regex.compile(
|
||||
# r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|^第二卷',
|
||||
# regex.MULTILINE
|
||||
# )
|
||||
# ),
|
||||
# (
|
||||
# regex.compile(
|
||||
# r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\))]?\s*$',
|
||||
# regex.MULTILINE
|
||||
# ),
|
||||
# regex.compile(
|
||||
# r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|^第二卷',
|
||||
# regex.MULTILINE
|
||||
# )
|
||||
# )
|
||||
# ]
|
||||
# output_suffix = "invalid"
|
||||
else:
|
||||
print("无效的选择:请选择1-5")
|
||||
return [""]
|
||||
@ -383,21 +381,6 @@ def truncate_pdf_main_engineering(input_path, output_folder, selection,logger,ou
|
||||
5: 0 # 无效标
|
||||
}.get(selection, 0)
|
||||
|
||||
if selection == 5: #无效标
|
||||
invalid_path = list(
|
||||
get_invalid_file(
|
||||
file_path,
|
||||
output_folder,
|
||||
begin_pattern,
|
||||
common_header,
|
||||
)
|
||||
)
|
||||
if invalid_path:
|
||||
return invalid_path
|
||||
else:
|
||||
# print(f"Selection {selection}: 使用组合提取函数失败。尝试下一个模式对。")
|
||||
continue
|
||||
|
||||
if selection == 4: # 投标人须知
|
||||
output_paths = list(
|
||||
extract_pages_tobidders_notice(
|
||||
@ -474,14 +457,14 @@ if __name__ == "__main__":
|
||||
logger=get_global_logger("123")
|
||||
start_time = time.time()
|
||||
# input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
|
||||
pdf_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest4_evaluation_method.pdf"
|
||||
pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles"
|
||||
|
||||
# pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
|
||||
# input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
|
||||
output_folder = r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp"
|
||||
# selection = 1 # 例如:1 - 招标公告, 2 - 评标办法, 3 -资格审查条件 4-投标人须知前附表+正文 5-无效标
|
||||
# generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection,logger)
|
||||
# print(generated_files)
|
||||
output_folder = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\tmp"
|
||||
selection = 5 # 例如:1 - 招标公告, 2 - 评标办法, 3 -资格审查条件 4-投标人须知前附表+正文 5-无效标
|
||||
generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection,logger)
|
||||
print(generated_files)
|
||||
# print("生成的文件:", generated_files)
|
||||
end_time = time.time()
|
||||
print("耗时:" + str(end_time - start_time))
|
||||
|
@ -4,7 +4,7 @@ import os # 用于文件和文件夹操作
|
||||
from flask_app.general.clean_pdf import clean_page_content, extract_common_header
|
||||
from flask_app.general.merge_pdfs import merge_and_cleanup
|
||||
from flask_app.general.截取pdf通用函数 import get_start_and_common_header, save_extracted_pages, is_pdf_or_doc, \
|
||||
convert_to_pdf
|
||||
convert_to_pdf, get_invalid_file
|
||||
from flask_app.general.通用功能函数 import get_global_logger
|
||||
|
||||
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
|
||||
@ -473,6 +473,7 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
|
||||
os.makedirs(output_folder)
|
||||
|
||||
# 获取起始和通用页眉
|
||||
pdf_path = convert_to_pdf(input_path)
|
||||
common_header, last_begin_index = get_start_and_common_header(input_path, 10)
|
||||
begin_page = last_begin_index if last_begin_index != 0 else {
|
||||
4: 1,
|
||||
@ -529,13 +530,12 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
|
||||
)
|
||||
local_output_suffix = "procurement"
|
||||
elif selection == 6:
|
||||
begin_pattern = regex.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:格式).*'
|
||||
)
|
||||
end_pattern = regex.compile(
|
||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE
|
||||
)
|
||||
local_output_suffix = "format"
|
||||
begin_page = last_begin_index
|
||||
invalid_path = get_invalid_file(pdf_path, output_folder, common_header, begin_page)
|
||||
if invalid_path:
|
||||
return [invalid_path]
|
||||
else:
|
||||
return [""]
|
||||
else:
|
||||
print("无效的选择:请选择1-6")
|
||||
return ['']
|
||||
@ -545,7 +545,6 @@ def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_
|
||||
output_suffix = local_output_suffix
|
||||
|
||||
# 将原先的 process_files 逻辑合并到此处
|
||||
pdf_path = convert_to_pdf(input_path)
|
||||
result = extract_pages(
|
||||
pdf_path,
|
||||
output_folder,
|
||||
|
Loading…
x
Reference in New Issue
Block a user