From d4d1a14c0614bb4abcded3f8896e0e6994746fe1 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Wed, 6 Nov 2024 09:17:40 +0800 Subject: [PATCH] =?UTF-8?q?11.5=E8=B4=A7=E7=89=A9=E6=A0=87=E6=88=AA?= =?UTF-8?q?=E5=8F=96=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/general/清除file_id.py | 23 ++++++++++++ flask_app/main/截取pdf.py | 21 +++++------ flask_app/货物标/截取pdf货物标版.py | 54 +++++++++-------------------- flask_app/货物标/资格审查main.py | 2 +- 4 files changed, 52 insertions(+), 48 deletions(-) create mode 100644 flask_app/general/清除file_id.py diff --git a/flask_app/general/清除file_id.py b/flask_app/general/清除file_id.py new file mode 100644 index 0000000..ebe8988 --- /dev/null +++ b/flask_app/general/清除file_id.py @@ -0,0 +1,23 @@ +import os +from openai import OpenAI +import json + +# 初始化 OpenAI 客户端 +client = OpenAI( + api_key=os.getenv("DASHSCOPE_API_KEY"), + base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", +) + +# 获取文件列表 +file_stk = client.files.list() + +# 将文件信息解析为 JSON 格式 +file_data = json.loads(file_stk.model_dump_json()) + +# 提取所有文件的 id +file_ids = [file["id"] for file in file_data["data"]] + +# 循环删除每个文件 +for file_id in file_ids: + file_object = client.files.delete(file_id) + print(file_object.model_dump_json()) diff --git a/flask_app/main/截取pdf.py b/flask_app/main/截取pdf.py index c6b09be..8466848 100644 --- a/flask_app/main/截取pdf.py +++ b/flask_app/main/截取pdf.py @@ -337,17 +337,17 @@ def truncate_pdf_main(input_path, output_folder, selection): # Selection 2: 评标办法 pattern_pairs = [ ( - re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(磋商|谈判|评标|评定|评审)\s*(办法|方法)'), + re.compile(r'^第[一二三四五六七八九十]+(?:章|部分)\s*(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))'), # Alternative begin pattern re.compile(r'评标办法正文|评标办法|^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+') # Alternative end pattern ), ( re.compile( - r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、]*?(磋商|谈判|评标|评定|评审)\s*(办法|方法)[\u4e00-\u9fff、]*\s*$|\s*评标(办法|方法)前附表\s*$', + r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()()]*?(?=.*(?:磋商|谈判|评标|评定|评审))(?=.*(?:办法|方法))[\u4e00-\u9fff、()()]*\s*$|\s*评标(办法|方法)前附表\s*$', re.MULTILINE ), - re.compile(r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$', re.MULTILINE) + re.compile(r'第[一二三四五六七八九十1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$', re.MULTILINE) ) ] output_suffix = "evaluation_method" @@ -559,22 +559,23 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections, uniqu # TODO:需要完善二次请求。目前invalid一定能返回 前附表 须知正文如果为空的话要额外处理一下,比如说就不进行跳转(见xx表) 开评定标这里也要考虑 如果评分表为空,也要处理。 -# TODO:zbtest8 zbtest18有问题 后期需要完善,截取需要截两次,第一次严格第二次宽松 # TODO:目前merged_baseinfo没有包含投标人须知正文。 + +#TODO:zbtest20有问题 # 投标人须知前附表改为货物标一样的 if __name__ == "__main__": start_time = time.time() # input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\工程标\\HBDL-2024-0017-001-招标文件.pdf" - input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf" + # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\ztbfile.pdf" # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf" - # input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest1.pdf" + input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹" output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\0b1861e6-c7f6-4541-9182-b1384ba84f3b\\tmp" # files=truncate_pdf_multiple(input_path,output_folder) - selections = [4, 1] # 仅处理 selection 4、1 - files=truncate_pdf_specific_engineering(input_path,output_folder,selections) + # selections = [4, 1] # 仅处理 selection 4、1 + # files=truncate_pdf_specific_engineering(input_path,output_folder,selections) # print(files) - # selection = 2 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标 - # generated_files = truncate_pdf_main(input_path, output_folder, selection) + selection = 2 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标 + generated_files = truncate_pdf_main(input_path, output_folder, selection) # print(generated_files) # print("生成的文件:", generated_files) end_time = time.time() diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py index 82efd13..8a1acf8 100644 --- a/flask_app/货物标/截取pdf货物标版.py +++ b/flask_app/货物标/截取pdf货物标版.py @@ -151,11 +151,12 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter def get_patterns_for_procurement(): begin_pattern = re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|' - r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|' - r'^[一二三四五六七八九十百千]+、\s*采购清单', re.MULTILINE) - end_pattern = re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) + r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分).*(?:采购|需求).*', + re.MULTILINE) + end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) + # begin_pattern=re.compile( + # r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、]*?(?:服务|项目|商务|技术)\s*(办法|方法)[\u4e00-\u9fff、]*\s*$' + # ) return begin_pattern, end_pattern @@ -176,21 +177,6 @@ def get_patterns_for_evaluation_method(): r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$', re.MULTILINE) return begin_pattern, end_pattern - -# def get_patterns_for_qualification(): -# # # 原始匹配逻辑 -# # begin_pattern_original = re.compile( -# # r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', re.MULTILINE) -# # end_pattern_original = re.compile( -# # r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) -# -# # 新匹配逻辑 -# begin_pattern_new = re.compile( -# r'^资格性检查', re.MULTILINE) -# end_pattern_new = re.compile( -# r'^附件\s*\d+|^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) -# -# return begin_pattern_new, end_pattern_new def get_patterns_for_qualification(): # begin_pattern = re.compile( # r'^(?:附录(?:一|1)?[::]|附件(?:一|1)?[::]|附表(?:一|1)?[::]|资格性检查|资格审查表).*(?:资格|符合性)?.*$', @@ -570,20 +556,6 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header,be return "" -# def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix): -# if output_suffix=='notice': -# print(start_page) -# base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] -# output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf") -# output_doc = PdfWriter() -# for page_num in range(start_page, end_page + 1): -# output_doc.add_page(pdf_document.pages[page_num]) -# with open(output_pdf_path, 'wb') as f: -# output_doc.write(f) -# print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}") -# return output_pdf_path - - def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix): try: # 检查 start_page 和 end_page 是否为 None @@ -701,9 +673,17 @@ def process_input(input_path, output_folder, selection, output_suffix): local_output_suffix = "tobidders_notice" elif selection == 5: begin_pattern = re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分).*?采购.*|^第[一二三四五六七八九十百千]+(?:章|部分).*?需求.*') + r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务|技术).*?要求|^第[一二三四五六七八九十百千]+(?:章|部分).*(?:采购|需求).*') end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+') local_output_suffix = "procurement" + + # begin_pattern = re.compile( + # r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|' + # r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|' + # r'^[一二三四五六七八九十百千]+、\s*采购清单', re.MULTILINE) + # end_pattern = re.compile( + # r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) + elif selection == 6: begin_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:格式).*') end_pattern = re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) @@ -825,11 +805,11 @@ if __name__ == "__main__": # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf" # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf" # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp" - output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹" + output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output1" # files = truncate_pdf_multiple(input_path, output_folder) # selections = [1,4] # files=truncate_pdf_specific_goods(input_path,output_folder,selections) # print(files) - selection = 3# 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 + selection = 5# 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 generated_files = truncate_pdf_main(input_path, output_folder, selection) print(generated_files) \ No newline at end of file diff --git a/flask_app/货物标/资格审查main.py b/flask_app/货物标/资格审查main.py index 34ae4ec..cc34f28 100644 --- a/flask_app/货物标/资格审查main.py +++ b/flask_app/货物标/资格审查main.py @@ -434,7 +434,7 @@ def combine_qualification_review(invalid_path, output_folder, qualification_path else: file_to_process = qualification_path - combined_res = process_file(file_to_process) + combined_res = process_file(file_to_process,invalid_path) match_keys = find_chapter_clause_references(combined_res) if not match_keys: