diff --git a/flask_app/main/基础信息整合快速版.py b/flask_app/main/基础信息整合快速版.py index f6d2172..62ed72e 100644 --- a/flask_app/main/基础信息整合快速版.py +++ b/flask_app/main/基础信息整合快速版.py @@ -53,7 +53,8 @@ def aggregate_basic_info_engineering(baseinfo_list): dynamic_key_handling(key_groups, relevant_keys_detected) # 创建一个副本以存储未分类的项目 - unclassified_items = {k: v for k, v in combined_data.items() if k not in [item for sublist in key_groups.values() for item in sublist]} + unclassified_items = {k: v for k, v in combined_data.items() if + k not in [item for sublist in key_groups.values() for item in sublist]} # 按键组分类并嵌套 for group_name, keys in key_groups.items(): @@ -73,6 +74,7 @@ def aggregate_basic_info_engineering(baseinfo_list): return combined_data + def dynamic_key_handling(key_groups, detected_keys): # 检查和调整键组配置 for key in detected_keys: @@ -120,6 +122,7 @@ def update_baseinfo_lists(baseinfo_list1, baseinfo_list2): return updated_list + def process_judge_questions(judge_file_path, chosen_numbers, tobidders_notice_table, baseinfo_list1): judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers) judge_consortium = judge_consortium_bidding(baseinfo_list1) @@ -138,6 +141,7 @@ def process_judge_questions(judge_file_path, chosen_numbers, tobidders_notice_ta for question, response in res2: baseinfo_list1.append(clean_json_string(response)) + def process_baseinfo_list(baseinfo_list, tobidders_notice): questions_list = [] for item in baseinfo_list: @@ -157,7 +161,8 @@ def process_baseinfo_list(baseinfo_list, tobidders_notice): else: return [] -def combine_basic_info(merged_baseinfo_path,tobidders_notice_table, tobidders_notice, clause_path): + +def combine_basic_info(merged_baseinfo_path, tobidders_notice_table, tobidders_notice, clause_path): """ 综合和处理基础信息,生成最终的基础信息字典。 @@ -170,7 +175,7 @@ def combine_basic_info(merged_baseinfo_path,tobidders_notice_table, tobidders_no 返回: - dict: 综合后的基础信息。 """ - baseinfo_prompt_file_path='flask_app/static/提示词/基本信息工程标qianwen-long.txt' + baseinfo_prompt_file_path = 'flask_app/static/提示词/基本信息工程标qianwen-long.txt' # baseinfo_prompt_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\基本信息工程标qianwen-long.txt' file_id1 = upload_file(merged_baseinfo_path) questions = read_questions_from_file(baseinfo_prompt_file_path) @@ -189,7 +194,8 @@ def combine_basic_info(merged_baseinfo_path,tobidders_notice_table, tobidders_no with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: # 提交两个任务 - future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, tobidders_notice_table, baseinfo_list1) + future1 = executor.submit(process_judge_questions, judge_file_path, chosen_numbers, tobidders_notice_table, + baseinfo_list1) future2 = executor.submit(process_baseinfo_list, baseinfo_list1_copy, tobidders_notice) # 等待两个任务完成并获取结果 @@ -205,19 +211,16 @@ def combine_basic_info(merged_baseinfo_path,tobidders_notice_table, tobidders_no aggregated_baseinfo = aggregate_basic_info_engineering(updated_list) return {"基础信息": aggregated_baseinfo} -#TODO:先不带投标人须知正文,如果是未知,再直接问正文, + +# TODO:先不带投标人须知正文,如果是未知,再直接问正文, if __name__ == "__main__": - start_time=time.time() - merged_baseinfo_path="C:\\Users\\Administrator\\Desktop\\招标文件\\special_output\\zbtest2_merged_baseinfo.pdf" + start_time = time.time() + merged_baseinfo_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\special_output\\zbtest2_merged_baseinfo.pdf" # output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\special_output" - tobidders_notice_table="C:\\Users\\Administrator\\Desktop\\招标文件\\special_output\\zbtest2_tobidders_notice_table.pdf" - tobidders_notice="C:\\Users\\Administrator\\Desktop\\招标文件\\special_output\\zbtest2_tobidders_notice.pdf" - clause_path="C:\\Users\\Administrator\\Desktop\\招标文件\\special_output\\clause1.json" - res=combine_basic_info(merged_baseinfo_path,tobidders_notice_table,tobidders_notice,clause_path) - print(json.dumps(res,ensure_ascii=False,indent=4)) - end_time=time.time() - print("elapsed_time:"+str(end_time-start_time)) - - - - + tobidders_notice_table = "C:\\Users\\Administrator\\Desktop\\招标文件\\special_output\\zbtest2_tobidders_notice_table.pdf" + tobidders_notice = "C:\\Users\\Administrator\\Desktop\\招标文件\\special_output\\zbtest2_tobidders_notice.pdf" + clause_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\special_output\\clause1.json" + res = combine_basic_info(merged_baseinfo_path, tobidders_notice_table, tobidders_notice, clause_path) + print(json.dumps(res, ensure_ascii=False, indent=4)) + end_time = time.time() + print("elapsed_time:" + str(end_time - start_time)) diff --git a/flask_app/main/截取pdf.py b/flask_app/main/截取pdf.py index a18d927..ecba6ee 100644 --- a/flask_app/main/截取pdf.py +++ b/flask_app/main/截取pdf.py @@ -196,7 +196,7 @@ def extract_pages_tobidders_notice(pdf_path, output_folder,begin_pattern, begin_ r'^附表(?:一)?[::]', # Match "附表一" with optional "一" and colon re.MULTILINE ) - print(f"使用默认的 end_pattern: {end_pattern.pattern}") # 打印默认的 end_pattern + # print(f"使用默认的 end_pattern: {end_pattern.pattern}") # 打印默认的 end_pattern # 定义基础的 mid_pattern base_mid_pattern = r'^\s*(?:[((]\s*[一1]?\s*[))]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)' @@ -204,9 +204,6 @@ def extract_pages_tobidders_notice(pdf_path, output_folder,begin_pattern, begin_ rf'{base_mid_pattern}', re.MULTILINE ) - print( - f"使用默认的 combined_mid_pattern: {combined_mid_pattern.pattern}") # 打印默认的 combined_mid_pattern - continue if start_page is not None and mid_page is None and combined_mid_pattern: @@ -306,6 +303,8 @@ def get_start_and_common_header(input_path): begin_pattern = re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函)\s*$',re.MULTILINE) pdf_document = PdfReader(input_path) for i, page in enumerate(pdf_document.pages): + if i > 25: + return common_header, 0 # 如果页码大于25,直接返回last_begin_index=0 text = page.extract_text() if text: cleaned_text = clean_page_content(text, common_header) @@ -371,7 +370,7 @@ def truncate_pdf_main(input_path, output_folder, selection): re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) ), ( - re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函)\s*$', re.MULTILINE), + re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|第一卷)\s*$', re.MULTILINE), re.compile(r".*(?:投标人须知|投标人须知前附表)\s*$", re.MULTILINE) ) ] @@ -381,7 +380,12 @@ def truncate_pdf_main(input_path, output_folder, selection): # Selection 5: 无效标 pattern_pairs = [ ( - re.compile(r'第[一二三四五六七八九十]+章\s*(招标公告|.*邀请.*)|第一卷|招标编号:|招标编号:'), + re.compile( + r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函).*|^第一卷|^投标邀请书'), + re.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|^第二卷', re.MULTILINE) + ), + ( + re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|第一卷)\s*$', re.MULTILINE), re.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|^第二卷', re.MULTILINE) ) ] @@ -393,8 +397,14 @@ def truncate_pdf_main(input_path, output_folder, selection): for idx, (begin_pattern, end_pattern) in enumerate(pattern_pairs, start=1): is_secondary_match = (idx == 2) + begin_page = last_begin_index if last_begin_index != 0 else { + 1: 3, #前附表 + 2: 10, #评标 + 3: 5, #资格 + 4: 0, #公告 + 5: 0 #无效标 + }.get(selection, 0) if selection == 1: #投标人须知 - begin_page = 3 output_paths = list( extract_pages_tobidders_notice(input_path,output_folder, begin_pattern, begin_page, common_header)) if output_paths and any(os.path.isfile(f) for f in output_paths): @@ -402,10 +412,6 @@ def truncate_pdf_main(input_path, output_folder, selection): else: # print(f"Selection {selection}: 使用组合提取函数失败。尝试下一个模式对。") continue - if selection == 2: #评分细则 - begin_page = 10 - else: - begin_page = last_begin_index output_paths = extract_pages( input_path, output_folder, @@ -669,7 +675,7 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections, uniqu #TODO:目前merged_baseinfo没有包含投标人须知正文。 #投标人须知前附表改为货物标一样的 if __name__ == "__main__": - input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf" # 可以是单个PDF文件路径或文件夹路径 #zbtest20.pdf zbtest4_evaluation_method.pdf zbtest8.pdf + input_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest7.pdf" # 可以是单个PDF文件路径或文件夹路径 #zbtest20.pdf zbtest4_evaluation_method.pdf zbtest8.pdf # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\68549b0b-e892-41a9-897c-c3694535ee61\\ztbfile.pdf" # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf" output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\新建文件夹" @@ -679,5 +685,5 @@ if __name__ == "__main__": # print(files) selection = 5 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标 generated_files = truncate_pdf_main(input_path, output_folder, selection) - print(generated_files) + # print("生成的文件:", generated_files) \ No newline at end of file diff --git a/flask_app/main/投标人须知正文条款提取成json文件.py b/flask_app/main/投标人须知正文条款提取成json文件.py index 84acd03..d819f8a 100644 --- a/flask_app/main/投标人须知正文条款提取成json文件.py +++ b/flask_app/main/投标人须知正文条款提取成json文件.py @@ -180,7 +180,7 @@ def convert_clause_to_json(input_path,output_folder,type=1): r'^附(?:录|件|表)(?:一)?[::]' ] else: - start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:' + start_word = r'第[一二三四五六七八九十]+章\s*招标公告|第一卷|招标编号:|招标编号:|投标邀请书|投标邀请函' end_phrases=[r'第[一二三四五六七八九十]+章\s*投标人须知',r'投标人须知前附表'] result = convert_to_json(input_path, start_word, end_phrases) #过滤无关信息 # 检查输出文件夹是否存在,如果不存在则创建