# -*- encoding:utf-8 -*-
import os
import re
import json
import time
from flask_app.general.llm.多线程提问 import multi_threading
from flask_app.工程标.根据条款号整合json import process_and_merge_entries, process_and_merge2
from flask_app.general.json_utils import clean_json_string
from flask_app.general.投标人须知正文条款提取成json文件 import convert_clause_to_json
from flask_app.general.llm.通义千问long import upload_file
from flask_app.general.merge_pdfs import merge_pdfs


def update_json_data(original_data, updates1, updates2, second_response_list):
    """Overwrite entries of ``original_data`` with values from the update sources.

    Keys may be dot-separated ("a.b.c") to address nested dictionaries.
    Missing intermediate dictionaries are created on the fly. The sources are
    applied in order (``updates1``, ``updates2``, then every dict of
    ``second_response_list``), so later sources win on conflicting keys.

    Args:
        original_data: dict that is updated in place.
        updates1: dict of (possibly dotted) keys to new values.
        updates2: dict of (possibly dotted) keys to new values.
        second_response_list: list of dicts of (possibly dotted) keys to new
            values.

    Returns:
        The same ``original_data`` object after the updates were applied.
    """

    def recursive_update(data, key, value):
        # Walk down to the parent of the last key segment, creating
        # intermediate dictionaries where they do not exist yet.
        *parents, leaf = key.split('.')
        for segment in parents:
            data = data.setdefault(segment, {})
        existing = data.get(leaf, None)
        if isinstance(value, dict) and isinstance(existing, dict):
            # Both sides are dicts: shallow-merge, new values take precedence.
            data[leaf] = {**existing, **value}
        else:
            data[leaf] = value

    for updates in (updates1, updates2, *second_response_list):
        for key, value in updates.items():
            recursive_update(original_data, key, value)
    return original_data


def judge_second_jump(input_dict):
    """Split out values that still point at the tender notice / chapter one.

    Every string value that mentions "招标公告" or "第一章" (inner whitespace
    allowed) is removed from ``input_dict`` and replaced by an LLM question
    asking for the actual content.

    Args:
        input_dict: dict of (possibly dotted) keys to values; modified in
            place.

    Returns:
        A tuple ``(input_dict, formatted_questions)`` where ``input_dict`` is
        the same object with the matched keys removed and
        ``formatted_questions`` is the list of generated question strings.
    """
    formatted_questions = []
    keys_to_remove = []
    chapter_pattern = re.compile(r'招\s*标\s*公\s*告|第\s*一\s*章')
    question_template = (
        "请你根据招标文件中的内容回答:关于{short_key}{additional_text},第一章招标公告中的内容是怎样的?"
        "请按json格式给我提供信息,键名为'{full_key}',而键值需要完全与原文保持一致,不要擅自总结、删减,"
        "如果存在未知信息,请在对应键值处填'未知'。"
    )
    for key, value in input_dict.items():
        # Only strings can contain a textual cross-reference. Non-string
        # values (update_json_data explicitly supports dict values) are kept
        # untouched instead of crashing the regex search with a TypeError.
        if not isinstance(value, str) or not chapter_pattern.search(value):
            continue
        formatted_questions.append(question_template.format(
            short_key=key.split('.')[-1],
            full_key=key,
            additional_text=get_additional_text(key),
        ))
        # Deleting while iterating over the dict is not allowed, so the keys
        # are only recorded here and removed after the loop.
        keys_to_remove.append(key)

    for key in keys_to_remove:
        del input_dict[key]
    return input_dict, formatted_questions


def extract_matching_keys(original_data):
    """Collect leaf values that merely reference another clause or chapter.

    A leaf is selected when its value matches one of the reference patterns
    ("第...章", "第...条", ...) AND contains "规定" or "条目", unless its key
    or value contains one of the exclusion keywords. Values that are already
    complete are skipped here.

    Example input::

        {
            "投标要求": {
                "资质要求": "符合第3章第2条规定",
                "业绩要求": "满足招标文件第4章相关条目"
            },
            "其他信息": {"投标文件格式": "按要求填写"}
        }

    Example output::

        [
            {"投标要求.资质要求": "符合第3章第2条规定"},
            {"投标要求.业绩要求": "满足招标文件第4章相关条目"}
        ]

    Args:
        original_data: dict, or a JSON string that is parsed with
            ``json.loads``.

    Returns:
        List of single-entry dicts mapping the dot-joined key path to the
        stringified leaf value.
    """
    data = json.loads(original_data) if isinstance(original_data, str) else original_data

    include_patterns = [re.compile(r"符合第"), re.compile(r"第.*?章"), re.compile(r"第.*?款"),
                        re.compile(r"第.*?项"), re.compile(r"第.*?目"), re.compile(r"第.*?条")]
    additional_include_patterns = [re.compile(r"规定"), re.compile(r"条目")]
    exclude_patterns = ["投标文件格式", "权利义务", "技术标准", "工程量清单"]
    additional_exclude_patterns = ["按要求", "按规定"]

    final_matching = []

    def contains_any(keywords, key, value):
        """Return True if any keyword occurs in the key or in the value."""
        return any(word in key or word in value for word in keywords)

    def process_value(key, value, path):
        """Append the leaf to ``final_matching`` if it passes every filter."""
        if contains_any(exclude_patterns, key, value):
            return
        if not any(pattern.search(value) for pattern in include_patterns):
            return
        if contains_any(additional_exclude_patterns, key, value):
            return
        if any(pattern.search(value) for pattern in additional_include_patterns):
            # Dot notation identifies the nested key.
            final_matching.append({".".join(path): value})

    def recursive_search(current_data, path=()):
        """Depth-first traversal; ``path`` is the tuple of keys walked so far."""
        if isinstance(current_data, dict):
            for key, value in current_data.items():
                new_path = path + (key,)
                if isinstance(value, (dict, list)):
                    recursive_search(value, new_path)
                else:
                    process_value(key, str(value), new_path)
        elif isinstance(current_data, list):
            # List items share the path of the list itself (no index segment).
            for item in current_data:
                recursive_search(item, path)

    recursive_search(data)
    return final_matching


def get_additional_text(key):
    """Return a synonym hint to append to a question about ``key``.

    The first keyword of the mapping that occurs in ``key`` decides the
    result; an empty string is returned when none occurs.
    """
    additional_text_map = {
        "投标内容": "(或招标范围)",
        "工程质量": "(或质量要求)",
        # More mappings can be added here.
    }
    return next(
        (text for keyword, text in additional_text_map.items() if keyword in key),
        "",
    )


def process_tender_entries(match_keys, output_folder, truncate0_jsonpath, clause_path):
    """Route each referencing entry to a clause-number lookup or an LLM question.

    Routing per entry (each entry is a single-item dict ``{key: value}``):

    * value mentions the tender notice / chapter one AND contains a clause
      number: looked up in the notice JSON (``entries_with_numbers2``) when
      that JSON could be produced, otherwise turned into a question;
    * value contains a clause number only: ``entries_with_numbers``, unless
      neither ``truncate0_jsonpath`` nor ``clause_path`` exists on disk, in
      which case it becomes a question;
    * anything else becomes a question.

    Args:
        match_keys: list of single-item dicts, e.g.
            ``[{"业绩要求": "3.1.2 近三年完成类似项目"}, ...]``.
        output_folder: folder searched recursively for a ``*_notice.pdf``.
        truncate0_jsonpath: path of the truncate JSON (may be empty).
        clause_path: path of the clause JSON (may be empty).

    Returns:
        Tuple ``(entries_with_numbers, formatted_questions,
        entries_with_numbers2, clause2_path)``; the two entry lists hold
        ``{key: "<clause number>"}`` dicts, and ``clause2_path`` is whatever
        ``convert_clause_to_json`` returned ("" if it was never called).
    """
    truncate0_empty = not truncate0_jsonpath or not os.path.exists(truncate0_jsonpath)
    clause_empty = not clause_path or not os.path.exists(clause_path)
    both_empty = truncate0_empty and clause_empty

    entries_with_numbers = []
    entries_with_numbers2 = []
    formatted_questions = []
    clause2_path = ""

    # Clause number such as "3.7.3", optionally followed by a sub-item in
    # half-width "(3)" or full-width (U+FF08 / U+FF09) parentheses. The
    # previous character class contained a literal "$" instead of the
    # full-width parentheses it was documented to accept.
    pattern = re.compile(r'(\d+(?:\.\d+)+)(?:[(\uff08](\d+)[)\uff09])?')
    # Matches '招标公告' and '第一章', tolerating whitespace between characters.
    chapter_pattern = re.compile(r'招\s*标\s*公\s*告|第\s*一\s*章')
    question_template = (
        "请你根据招标文件中的内容回答:关于{short_key}{additional_text},{revised_value}的内容是怎样的?"
        "请按json格式给我提供信息,键名为'{full_key}',而键值需要完全与原文保持一致,不要擅自总结、删减,"
        "如果存在未知信息,请在对应键值处填'未知'。"
    )

    def format_num(match):
        """Render the matched clause number, e.g. "3.7.3(3)" or "1.3.1"."""
        sub_item = match.group(2)
        return match.group(1) + (f"({sub_item})" if sub_item else "")

    def add_formatted_question(key, value):
        """Build the LLM question for ``key`` and store it."""
        formatted_questions.append(question_template.format(
            short_key=key.split('.')[-1],
            full_key=key,
            revised_value=value.replace('符合', ''),
            additional_text=get_additional_text(key),
        ))

    def find_notice_pdf():
        """Return the first '*_notice.pdf' below ``output_folder`` or ""."""
        return next(
            (os.path.join(root, file)
             for root, _, files in os.walk(output_folder)
             for file in files
             if file.endswith('_notice.pdf')),
            "",
        )

    # The notice JSON is produced lazily and at most once; the flag also
    # prevents re-walking the folder for every entry when no notice exists.
    clause2_attempted = False

    for entry in match_keys:
        key, value = next(iter(entry.items()))
        match = pattern.search(value)

        if chapter_pattern.search(value):  # refers to the tender notice
            if match and not clause2_attempted:
                clause2_attempted = True
                notice_path = find_notice_pdf()
                if notice_path:
                    # NOTE(review): the meaning of the literal 2 is defined by
                    # convert_clause_to_json, which is not visible here.
                    clause2_path = convert_clause_to_json(notice_path, output_folder, 2)
            if match and clause2_path:
                entries_with_numbers2.append({key: format_num(match)})
            else:
                # No clause number, no notice PDF, or the conversion yielded
                # no JSON path: ask the LLM instead of silently dropping the
                # entry.
                add_formatted_question(key, value)
        elif match and not both_empty:
            entries_with_numbers.append({key: format_num(match)})
        else:
            add_formatted_question(key, value)

    return entries_with_numbers, formatted_questions, entries_with_numbers2, clause2_path


def fetch_specific_pdf(output_folder):
    """Pick the PDF that is uploaded as context for the LLM questions.

    The folder is walked recursively. If a file ending in
    'merged_baseinfo.pdf' exists, it is merged with every file ending in
    'tobidders_notice.pdf' into '<output_folder>/merged_reviews.pdf' and the
    value returned by ``merge_pdfs`` is passed on. Otherwise the first file
    ending in 'invalid.pdf' is returned, or "" when there is none.

    The decision is taken only after the whole tree was scanned, so the
    result does not depend on the order in which ``os.walk`` yields files.
    """
    pdf_paths = []
    baseinfo_found = False
    invalid_path = ""

    for root, _dirs, files in os.walk(output_folder):
        for file in files:
            if file.endswith('merged_baseinfo.pdf'):
                baseinfo_found = True
                pdf_paths.append(os.path.join(root, file))
            elif file.endswith('tobidders_notice.pdf'):
                pdf_paths.append(os.path.join(root, file))
            elif not invalid_path and file.endswith('invalid.pdf'):
                # Remember only the first fallback candidate.
                invalid_path = os.path.join(root, file)

    if baseinfo_found:
        output_path = os.path.join(output_folder, "merged_reviews.pdf")
        return merge_pdfs(pdf_paths, output_path)
    # Fallback when merged_baseinfo.pdf is absent ("" if no invalid.pdf).
    return invalid_path


def process_reviews(original_dict_data, output_folder, truncate0_jsonpath, clause_json_path):
    """Replace cross-references in the review criteria with the real content.

    Steps:
      1. find the leaves that only reference another clause/chapter;
      2. resolve clause numbers through the clause JSON files;
      3. ask the LLM about everything that could not be resolved that way;
      4. write all answers back into ``original_dict_data``.

    Args:
        original_dict_data: nested dict of review criteria (updated in place).
        output_folder: folder holding the PDFs produced for this tender.
        truncate0_jsonpath: path of the truncate JSON (may be empty).
        clause_json_path: path of the clause JSON (may be empty).

    Returns:
        The updated ``original_dict_data``.
    """
    combined_results2 = {}
    first_response_list = []

    matched_keys = extract_matching_keys(original_dict_data)
    (entries_with_numbers, formatted_questions1,
     entries_with_numbers2, clause2_path) = process_tender_entries(
        matched_keys, output_folder, truncate0_jsonpath, clause_json_path)

    # May be {}.
    combined_results = process_and_merge_entries(entries_with_numbers, truncate0_jsonpath, clause_json_path)
    # May be {} and [].
    combined_results1, formatted_questions2 = judge_second_jump(combined_results)

    if entries_with_numbers2:
        combined_results2 = process_and_merge2(entries_with_numbers2, clause2_path)

    formatted_questions = formatted_questions1 + formatted_questions2
    if formatted_questions:
        output_path = fetch_specific_pdf(output_folder)  # merged context PDF
        if output_path:
            file_id = upload_file(output_path)
            # NOTE(review): the "" and 2 arguments are interpreted by
            # multi_threading, which is not visible here.
            results = multi_threading(formatted_questions, "", file_id, 2)
            first_response_list = [clean_json_string(res) for _, res in results] if results else []
            print(first_response_list)

    updated_json = update_json_data(original_dict_data, combined_results1, combined_results2, first_response_list)
    return updated_json


# TODO: extraction is attempted whenever the clause / truncate JSON paths are
# non-empty, but the lookup itself can still fail; in that case the LLM may
# need to be asked again.
if __name__ == "__main__":
    start_time = time.time()
    # knowledge_name="zbfile"
    truncate_tobidders_table_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\3395fc1b-432d-407d-a872-fc35e8475aef\\truncate_output.json"
    clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\3395fc1b-432d-407d-a872-fc35e8475aef\\clause1.json"
    original_dict_data = {
        "形式评审标准": {
            "投标文件": "投标文件能正常打开",
            "投标人名称": "与营业执照、资质证书、安全生产许可证一致",
            "投标文件签字盖章": "符合第二章“投标人须知”第 3.7.3(3)目规定",
            "投标文件格式、内容": "符合第八章“投标文件格式”的要求,实质性内容齐全、关键字迹清晰可辨",
            "联合体投标人(如有)": "提交联合体协议书,并明确联合体牵头人",
            "报价唯一": "只能有一个有效报价(指投标函中的大写报价)",
            "多标段投标": "符合第一章规定",
            "“技术暗标”": "符合第二章“投标人须知”第 3.7.4(5)目规定"
        },
        "响应性评审标准": {
            "投标内容": "符合第二章“投标人须知”第 1.3.1 项规定",
            "工期": "符合第二章“投标人须知”第 1.3.2 项规定",
            "工程质量": "符合招标公告投标人须知”第 1.3.3 项规定",
            "投标有效期": "符合第二章“投标人须知”第 3.3.1 项规定",
            "投标保证金": "符合第二章“投标人须知”规定",
            "权利义务": "投标函附录中的相关承诺符合或优于第四章“合同条款及格式”的相关规定",
            "技术标准和要求": "符合第七章“技术标准和要求”规定",
            "分包计划": "符合第二章“投标人须知”规定",
            "算术错误修正": "1)投标人接受算术错误修正后的报价 2)修正后的报价与投标报价相比偏差率不超过±1%",
            "投标价格": "1)投标函中的大写报价与已标价工程量清单中的投标总价一致 2)投标函中的大写报价不大于本标段招标控制价总价 3)算术错误修正后的投标总报价不大于本标段招标控制价总价 4)投标报价不低于其成本",
            "已标价工程量清单": "1)已标价工程量清单项目编码顺序与第五章“工程量清单”给出的项目编码顺序一致; 2)已标价工程量清单符合第五章“工程量清单”给出的项目编码、项目名称、项目特征、计量单位和工程量; 3)暂列金额符合第五章“工程量清单”列出的金额; 4)专业工程暂估价符合第五章“工程量清单”列出的金额; 5)材料(工程设备)暂估价符合第五章“工程量清单”列出的单价并计入综合单价; 6)安全文明施工费、规费和税金等不可竞争费用,按照规定的标准计取; 7)计税方法符合招标文件的约定; 8)已标价工程量清单项目未填报的项目不超过三项,或不超过三项未填报的项目的费用合计(按招标控制价相应项目的费用合计计算)不超过其投标总报价(修正后的投标总报价,如有)的±1%"
        }
    }
    output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test"
    # notice_path="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_notice.pdf"
    formal_json = process_reviews(original_dict_data, output_folder, truncate_tobidders_table_json_path, "")
    data = json.dumps(formal_json, ensure_ascii=False, indent=4)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Function execution took {elapsed_time} seconds.")
    print(data)

# Alternative prompt wording kept for reference (English paraphrase):
# "Regarding '技术暗标', what does Chapter 2 'Instructions to Bidders'
# stipulate? Provide the information as JSON with the key '技术暗标'. Stay
# faithful to the original text, answer completely and accurately, do not
# summarise or omit anything, and do not give answers without substance such
# as 'see the bidder instruction data sheet' or 'see clause x.x'."