11.5货物标截取优化

This commit is contained in:
zy123 2024-11-05 16:57:04 +08:00
parent ccab078ac3
commit f9f17757f3

View File

@ -474,10 +474,6 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page)
if start_page1 is None or end_page1 is None:
return "", "",""
# # 保存第一部分的路径
# path1 = save_extracted_pages(pdf_document, start_page1, end_page1, pdf_path, output_folder,
# "tobidders_notice_part1")
# 提取第二部分
start_page2 = end_page1
@ -501,10 +497,6 @@ def extract_pages_twice_tobidders_notice(pdf_document, common_header,begin_page)
if end_page2 is None:
return start_page1, end_page1,end_page1
# # 保存第二部分的路径
# path2 = save_extracted_pages(pdf_document, start_page2, end_page2, pdf_path, output_folder,
# "tobidders_notice_part2")
return start_page1, end_page1,end_page2
def extract_pages_qualification(pdf_document,begin_page,begin_pattern,end_pattern, common_header):
@ -524,7 +516,7 @@ def extract_pages_qualification(pdf_document,begin_page,begin_pattern,end_patter
start_page is None
):
if begin_pattern.search(cleaned_text):
print("附件")
print("第二次尝试:匹配附件")
start_page = i
# 确定结束页
if start_page is not None and end_pattern.search(cleaned_text):
@ -562,8 +554,8 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,be
if start_page is None or end_page is None:
if output_suffix == "qualification1":
print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
print("third:尝试提取评分办法章节...")
# print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
print("第三次尝试资格审查:尝试提取评分办法章节...")
temp = truncate_pdf_main(pdf_path, output_folder, 2, "qualification2")
if len(temp) > 0:
return temp[0]
@ -825,9 +817,9 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
# TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 资格审查文件可能不需要默认与"evaluation"同一章 无效投标可能也要考虑 “more”的情况类似工程标 唐山投标只有正文,没有附表
#ztbfile.pdf 交通 广水农商行门禁控制主机及基础验证设备采购项目——磋商文件定稿三次_qualification2.pdf 唐山 包头
#ztbfile.pdf少资格评审 包头少符合性评审
if __name__ == "__main__":
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\ztbfile.pdf"
# input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\f8b793b5-aa60-42d3-ae59-a3f474e06610\\ztbfile.pdf"
# input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
@ -838,6 +830,6 @@ if __name__ == "__main__":
# selections = [1,4]
# files=truncate_pdf_specific_goods(input_path,output_folder,selections)
# print(files)
selection = 4# 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
selection = 3# 例如1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
generated_files = truncate_pdf_main(input_path, output_folder, selection)
# print(generated_files)
print(generated_files)