11.4
This commit is contained in:
parent
9f2fceb8c2
commit
599915fe29
@ -72,7 +72,7 @@ def little_parse_engineering(output_folder, file_path):
|
|||||||
dict: 包含 '基础信息' 的字典。
|
dict: 包含 '基础信息' 的字典。
|
||||||
"""
|
"""
|
||||||
# 截取特定的工程 PDF 文件
|
# 截取特定的工程 PDF 文件
|
||||||
selections = [ 1,5] #公告+投标人须知前附表
|
selections = [ 1,4] #公告+投标人须知前附表
|
||||||
files = truncate_pdf_specific_engineering(file_path, output_folder,selections)
|
files = truncate_pdf_specific_engineering(file_path, output_folder,selections)
|
||||||
if not files:
|
if not files:
|
||||||
raise ValueError("未找到截取后的文件。")
|
raise ValueError("未找到截取后的文件。")
|
||||||
|
@ -339,14 +339,14 @@ def truncate_pdf_main(input_path, output_folder, selection):
|
|||||||
# Selection 2: 评标办法
|
# Selection 2: 评标办法
|
||||||
pattern_pairs = [
|
pattern_pairs = [
|
||||||
(
|
(
|
||||||
re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(磋商|谈判|评标|评定|评审)'),
|
re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(磋商|谈判|评标|评定|评审)\s*(办法|方法)'),
|
||||||
# Alternative begin pattern
|
# Alternative begin pattern
|
||||||
re.compile(r'评标办法正文|评标办法|^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+')
|
re.compile(r'评标办法正文|评标办法|^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+')
|
||||||
# Alternative end pattern
|
# Alternative end pattern
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
re.compile(
|
re.compile(
|
||||||
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、]*?(磋商|谈判|评标|评定|评审)[\u4e00-\u9fff、]*\s*$|\s*评标(办法|方法)前附表\s*$',
|
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、]*?(磋商|谈判|评标|评定|评审)\s*(办法|方法)[\u4e00-\u9fff、]*\s*$|\s*评标(办法|方法)前附表\s*$',
|
||||||
re.MULTILINE
|
re.MULTILINE
|
||||||
),
|
),
|
||||||
re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$', re.MULTILINE)
|
re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$', re.MULTILINE)
|
||||||
@ -566,18 +566,18 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections, uniqu
|
|||||||
# 投标人须知前附表改为货物标一样的
|
# 投标人须知前附表改为货物标一样的
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标\\HBDL-2024-0017-001-招标文件.pdf"
|
# input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标\\HBDL-2024-0017-001-招标文件.pdf"
|
||||||
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\68549b0b-e892-41a9-897c-c3694535ee61\\ztbfile.pdf"
|
input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf"
|
||||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
|
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
|
||||||
# input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest1.pdf"
|
# input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest1.pdf"
|
||||||
output_folder = "C:\\Users\\Administrator\\Desktop\\new招标文件\\招标文件\\tmp"
|
output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\tmp"
|
||||||
# files=truncate_pdf_multiple(input_path,output_folder)
|
# files=truncate_pdf_multiple(input_path,output_folder)
|
||||||
# selections = [5, 1] # 仅处理 selection 5、1
|
selections = [4, 1] # 仅处理 selection 4、1
|
||||||
# files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
|
files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
|
||||||
# print(files)
|
# print(files)
|
||||||
selection = 2 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
|
# selection = 2 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
|
||||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
# generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||||
print(generated_files)
|
# print(generated_files)
|
||||||
# print("生成的文件:", generated_files)
|
# print("生成的文件:", generated_files)
|
||||||
end_time = time.time()
|
end_time = time.time()
|
||||||
print("耗时:" + str(end_time - start_time))
|
print("耗时:" + str(end_time - start_time))
|
||||||
|
@ -158,8 +158,18 @@ def get_patterns_for_procurement():
|
|||||||
|
|
||||||
|
|
||||||
def get_patterns_for_evaluation_method():
|
def get_patterns_for_evaluation_method():
|
||||||
|
# begin_pattern = re.compile(
|
||||||
|
# r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]*?(磋商|谈判|评标|评定|评审)\s*(办法|方法)[\u4e00-\u9fff、()()]*\s*$',
|
||||||
|
# re.MULTILINE
|
||||||
|
# )
|
||||||
begin_pattern = re.compile(
|
begin_pattern = re.compile(
|
||||||
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、]*?(磋商|谈判|评标|评定|评审)[\u4e00-\u9fff、]*\s*$', re.MULTILINE)
|
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分”
|
||||||
|
r'(?:[\u4e00-\u9fff、()()]*?)' # 匹配允许的字符(中文、顿号、括号)
|
||||||
|
r'(?=.*(?:磋商|谈判|评标|评定|评审))' # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审”
|
||||||
|
r'(?=.*(?:办法|方法))' # 确保包含“办法”或“方法”
|
||||||
|
r'[\u4e00-\u9fff、()()]*\s*$', # 继续匹配允许的字符直到行尾
|
||||||
|
re.MULTILINE
|
||||||
|
)
|
||||||
end_pattern = re.compile(
|
end_pattern = re.compile(
|
||||||
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$', re.MULTILINE)
|
r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$', re.MULTILINE)
|
||||||
return begin_pattern, end_pattern
|
return begin_pattern, end_pattern
|
||||||
@ -599,7 +609,7 @@ def process_input(input_path, output_folder, selection, output_suffix):
|
|||||||
local_output_suffix = "notice"
|
local_output_suffix = "notice"
|
||||||
elif selection == 2:
|
elif selection == 2:
|
||||||
begin_pattern = re.compile(
|
begin_pattern = re.compile(
|
||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]*?(磋商|谈判|评标|评定|评审)[\u4e00-\u9fff、]*')
|
r'^第[一二三四五六七八九十百千]+(?:章|部分).*(磋商|谈判|评标|评定|评审|办法|方法).*(磋商|谈判|评标|评定|评审|办法|方法)')
|
||||||
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
|
end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+')
|
||||||
local_output_suffix = "evaluation_method"
|
local_output_suffix = "evaluation_method"
|
||||||
elif selection == 3:
|
elif selection == 3:
|
||||||
@ -727,21 +737,21 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1
|
|||||||
logger.warning(f"合并失败,没有生成合并文件 for {pdf_path}")
|
logger.warning(f"合并失败,没有生成合并文件 for {pdf_path}")
|
||||||
|
|
||||||
return truncate_files
|
return truncate_files
|
||||||
|
#TODO:截取还有问题
|
||||||
|
|
||||||
# TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 资格审查文件可能不需要默认与"evaluation"同一章 无效投标可能也要考虑 “more”的情况,类似工程标 唐山投标只有正文,没有附表
|
# TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 资格审查文件可能不需要默认与"evaluation"同一章 无效投标可能也要考虑 “more”的情况,类似工程标 唐山投标只有正文,没有附表
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\货物标\\HBDL-2024-0519-001-招标文件.pdf"
|
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
|
||||||
# input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.pdf"
|
# input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\f8b793b5-aa60-42d3-ae59-a3f474e06610\\ztbfile.pdf"
|
||||||
input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
|
# input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
|
||||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
|
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
|
||||||
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
|
# input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
|
||||||
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
|
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
|
||||||
output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
|
output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
|
||||||
# files = truncate_pdf_multiple(input_path, output_folder)
|
# files = truncate_pdf_multiple(input_path, output_folder)
|
||||||
selections = [1,4]
|
# selections = [1,4]
|
||||||
files=truncate_pdf_specific_goods(input_path,output_folder,selections)
|
# files=truncate_pdf_specific_goods(input_path,output_folder,selections)
|
||||||
print(files)
|
# print(files)
|
||||||
# selection = 4# 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
|
selection = 2# 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求
|
||||||
# generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||||
# print(generated_files)
|
# print(generated_files)
|
Loading…
x
Reference in New Issue
Block a user