From 599915fe29c60418fe645409a55712d1de10a13c Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Tue, 5 Nov 2024 09:33:18 +0800 Subject: [PATCH] 11.4 --- flask_app/general/little_zbparse.py | 2 +- flask_app/main/截取pdf.py | 20 +++++++++--------- flask_app/货物标/截取pdf货物标版.py | 32 +++++++++++++++++++---------- 3 files changed, 32 insertions(+), 22 deletions(-) diff --git a/flask_app/general/little_zbparse.py b/flask_app/general/little_zbparse.py index 64f766f..71960d4 100644 --- a/flask_app/general/little_zbparse.py +++ b/flask_app/general/little_zbparse.py @@ -72,7 +72,7 @@ def little_parse_engineering(output_folder, file_path): dict: 包含 '基础信息' 的字典。 """ # 截取特定的工程 PDF 文件 - selections = [ 1,5] #公告+投标人须知前附表 + selections = [ 1,4] #公告+投标人须知前附表 files = truncate_pdf_specific_engineering(file_path, output_folder,selections) if not files: raise ValueError("未找到截取后的文件。") diff --git a/flask_app/main/截取pdf.py b/flask_app/main/截取pdf.py index e19e88b..66e388b 100644 --- a/flask_app/main/截取pdf.py +++ b/flask_app/main/截取pdf.py @@ -339,14 +339,14 @@ def truncate_pdf_main(input_path, output_folder, selection): # Selection 2: 评标办法 pattern_pairs = [ ( - re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(磋商|谈判|评标|评定|评审)'), + re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(磋商|谈判|评标|评定|评审)\s*(办法|方法)'), # Alternative begin pattern re.compile(r'评标办法正文|评标办法|^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+') # Alternative end pattern ), ( re.compile( - r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、]*?(磋商|谈判|评标|评定|评审)[\u4e00-\u9fff、]*\s*$|\s*评标(办法|方法)前附表\s*$', + r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、]*?(磋商|谈判|评标|评定|评审)\s*(办法|方法)[\u4e00-\u9fff、]*\s*$|\s*评标(办法|方法)前附表\s*$', re.MULTILINE ), re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$', re.MULTILINE) @@ -566,18 +566,18 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections, uniqu # 投标人须知前附表改为货物标一样的 if __name__ == "__main__": start_time = time.time() - input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标\\HBDL-2024-0017-001-招标文件.pdf" - # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\68549b0b-e892-41a9-897c-c3694535ee61\\ztbfile.pdf" + # input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标\\HBDL-2024-0017-001-招标文件.pdf" + input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf" # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf" # input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest1.pdf" - output_folder = "C:\\Users\\Administrator\\Desktop\\new招标文件\\招标文件\\tmp" + output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\tmp" # files=truncate_pdf_multiple(input_path,output_folder) - # selections = [5, 1] # 仅处理 selection 5、1 - # files=truncate_pdf_specific_engineering(input_path,output_folder,selections) + selections = [4, 1] # 仅处理 selection 4、1 + files=truncate_pdf_specific_engineering(input_path,output_folder,selections) # print(files) - selection = 2 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标 - generated_files = truncate_pdf_main(input_path, output_folder, selection) - print(generated_files) + # selection = 2 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标 + # generated_files = truncate_pdf_main(input_path, output_folder, selection) + # print(generated_files) # print("生成的文件:", generated_files) end_time = time.time() print("耗时:" + str(end_time - start_time)) diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py index 4f54f66..a224ab3 100644 --- a/flask_app/货物标/截取pdf货物标版.py +++ b/flask_app/货物标/截取pdf货物标版.py @@ -158,8 +158,18 @@ def get_patterns_for_procurement(): def get_patterns_for_evaluation_method(): + # begin_pattern = re.compile( + # r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]*?(磋商|谈判|评标|评定|评审)\s*(办法|方法)[\u4e00-\u9fff、()()]*\s*$', + # re.MULTILINE + # ) begin_pattern = re.compile( - r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、]*?(磋商|谈判|评标|评定|评审)[\u4e00-\u9fff、]*\s*$', re.MULTILINE) + r'第[一二三四五六七八九1-9]+(?:章|部分)\s*' # 匹配“第X章”或“第X部分” + r'(?:[\u4e00-\u9fff、()()]*?)' # 匹配允许的字符(中文、顿号、括号) + r'(?=.*(?:磋商|谈判|评标|评定|评审))' # 确保包含“磋商”、“谈判”、“评标”、“评定”或“评审” + r'(?=.*(?:办法|方法))' # 确保包含“办法”或“方法” + r'[\u4e00-\u9fff、()()]*\s*$', # 继续匹配允许的字符直到行尾 + re.MULTILINE + ) end_pattern = re.compile( r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$', re.MULTILINE) return begin_pattern, end_pattern @@ -599,7 +609,7 @@ def process_input(input_path, output_folder, selection, output_suffix): local_output_suffix = "notice" elif selection == 2: begin_pattern = re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff、]*?(磋商|谈判|评标|评定|评审)[\u4e00-\u9fff、]*') + r'^第[一二三四五六七八九十百千]+(?:章|部分).*(磋商|谈判|评标|评定|评审|办法|方法).*(磋商|谈判|评标|评定|评审|办法|方法)') end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+') local_output_suffix = "evaluation_method" elif selection == 3: @@ -727,21 +737,21 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1 logger.warning(f"合并失败,没有生成合并文件 for {pdf_path}") return truncate_files - +#TODO:截取还有问题 # TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 资格审查文件可能不需要默认与"evaluation"同一章 无效投标可能也要考虑 “more”的情况,类似工程标 唐山投标只有正文,没有附表 if __name__ == "__main__": - # input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\货物标\\HBDL-2024-0519-001-招标文件.pdf" - # input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.pdf" - input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf" + input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles" + # input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\f8b793b5-aa60-42d3-ae59-a3f474e06610\\ztbfile.pdf" + # input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf" # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf" # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf" # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp" output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹" # files = truncate_pdf_multiple(input_path, output_folder) - selections = [1,4] - files=truncate_pdf_specific_goods(input_path,output_folder,selections) - print(files) - # selection = 4# 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 - # generated_files = truncate_pdf_main(input_path, output_folder, selection) + # selections = [1,4] + # files=truncate_pdf_specific_goods(input_path,output_folder,selections) + # print(files) + selection = 2# 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 + generated_files = truncate_pdf_main(input_path, output_folder, selection) # print(generated_files) \ No newline at end of file