217 lines
12 KiB
Python
217 lines
12 KiB
Python
# -*- encoding:utf-8 -*-
|
||
import re
|
||
import json
|
||
import time
|
||
|
||
from flask_app.main.多线程提问 import multi_threading
|
||
from flask_app.main.根据条款号整合json import process_and_merge_entries,process_and_merge2
|
||
from flask_app.main.json_utils import extract_content_from_json
|
||
from flask_app.main.截取pdf import truncate_pdf_main
|
||
from flask_app.main.投标人须知正文条款提取成json文件 import convert_clause_to_json
|
||
# Role/skill prompt template for a document-processing assistant.
# ``${document1}`` is a literal placeholder inside the text; nothing in this
# part of the file substitutes it or reads ``prompt`` -- presumably a caller or
# the model platform fills it in (TODO confirm).
# The stray ``|`` / ``||`` lines that had leaked into the literal were
# extraction debris and have been removed so they are not sent as prompt text.
prompt = """
# 角色
你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。

## 技能
### 技能 1:文档解析与摘要
- 深入理解并分析${document1}的内容,提取关键信息。
- 根据需求生成简洁明了的摘要,保持原文核心意义不变。

### 技能 2:信息检索与关联
- 在${document1}中高效检索特定信息或关键词。
- 能够识别并链接到文档内部或外部的相关内容,增强信息的连贯性和深度。

## 限制
- 所有操作均需基于${document1}的内容,不可超出此范围创造信息。
- 在处理敏感或机密信息时,需遵守严格的隐私和安全规定。
- 确保所有生成或改编的内容逻辑连贯,无误导性信息。

请注意,上述技能执行时将直接利用并参考${document1}的具体内容,以确保所有产出紧密相关且高质量。
"""
|
||
|
||
|
||
def update_json_data(original_data, updates1, updates2, second_response_list):
    """Overlay several sets of updates onto ``original_data`` and return it.

    Keys may use dot notation (``"a.b.c"``) to address nested dictionaries;
    missing intermediate levels are created on demand. When both the existing
    leaf and the new value are dicts they are shallow-merged (new keys win);
    otherwise the new value replaces the old one.

    Sources are applied in this order, so later ones override earlier ones:
    ``updates1``, ``updates2``, then every dict in ``second_response_list``.

    Args:
        original_data: dict holding the original JSON data. It is mutated in
            place and is also the return value.
        updates1: dict of ``dotted key -> value`` updates (may be None).
        updates2: dict of ``dotted key -> value`` updates (may be None).
        second_response_list: list of dicts of ``dotted key -> value`` updates.
            Entries that are not dicts are skipped instead of raising.

    Returns:
        The same ``original_data`` object after all updates were applied.
    """

    def recursive_update(data, key, value):
        # Walk (and create) the parent dictionaries named by the dotted key.
        *parent_keys, leaf_key = key.split('.')
        for part in parent_keys:
            child = data.get(part)
            if not isinstance(child, dict):
                # A scalar sitting where a nested dict is required would make
                # the next lookup fail; replace it with a fresh dict so the
                # more specific update can be stored.
                child = {}
                data[part] = child
            data = child

        existing = data.get(leaf_key)
        if isinstance(value, dict) and isinstance(existing, dict):
            data[leaf_key] = {**existing, **value}
        else:
            data[leaf_key] = value

    update_sources = [updates1, updates2, *(second_response_list or [])]
    for updates in update_sources:
        if not isinstance(updates, dict):
            # e.g. None or a malformed response; nothing to merge.
            continue
        for key, value in updates.items():
            recursive_update(original_data, key, value)

    return original_data
|
||
|
||
def judge_second_jump(input_dict):
    """Pull out entries whose text still refers to Chapter 1 / the tender notice.

    An entry is selected when its value is a string containing '招标公告' or
    '第一章'. For every selected entry a question string is generated from the
    key and the entry is deleted from ``input_dict``.

    Args:
        input_dict: dict of ``key -> value``. It is mutated in place. Values
            that are not strings are never selected (previously None or
            numeric values raised TypeError). None is treated as empty.

    Returns:
        Tuple ``(input_dict, formatted_questions)``: the same dict without the
        selected entries, and the list of generated question strings.
    """
    if input_dict is None:
        return {}, []

    jump_markers = ('招标公告', '第一章')
    formatted_questions = []
    keys_to_remove = []

    for key, value in input_dict.items():
        if isinstance(value, str) and any(marker in value for marker in jump_markers):
            formatted_questions.append(
                f"关于'{key}',第一章招标公告中的内容是怎样的?请按json格式给我提供信息,键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'。"
            )
            keys_to_remove.append(key)

    # Delete only after iteration: a dict must not change size while iterated.
    for key in keys_to_remove:
        del input_dict[key]

    return input_dict, formatted_questions
|
||
|
||
def extract_matching_keys(json_data):
    """Collect leaf values that refer to another chapter/clause of the document.

    Example of a matching entry: '投标人名称': '符合招标公告第3.2条规定'.

    The input is walked recursively. A leaf (stringified with ``str``) is kept
    when all of the following hold:
      1. neither key nor value contains an ``exclude_patterns`` term;
      2. the value matches at least one ``include_patterns`` regex;
      3. neither key nor value contains an ``additional_exclude_patterns`` term;
      4. the value matches at least one ``additional_include_patterns`` regex.

    Args:
        json_data: a dict/list structure, or a JSON string that is parsed with
            ``json.loads`` first.

    Returns:
        List of single-item dicts ``{dotted.key.path: value_text}`` in
        traversal order. List elements do not add a path component, so items
        of the same list share their parent's path.

    Raises:
        json.JSONDecodeError: if ``json_data`` is a string that is not valid JSON.
    """
    data = json.loads(json_data) if isinstance(json_data, str) else json_data

    include_patterns = [
        re.compile(r"符合第"),
        re.compile(r"第.*?章"),
        re.compile(r"第.*?款"),
        re.compile(r"第.*?项"),
        re.compile(r"第.*?目"),
        re.compile(r"第.*?条"),
    ]
    additional_include_patterns = [re.compile(r"规定"), re.compile(r"条目")]
    exclude_patterns = ["投标文件格式", "权利义务", "技术标准", "工程量清单"]
    additional_exclude_patterns = ["按要求", "按规定"]

    final_matching = []

    def mentions_any(terms, key, value):
        # True when any plain-text term occurs in the key or in the value.
        return any(term in key or term in value for term in terms)

    def is_reference(key, value):
        # Same check order as the rules listed in the docstring.
        if mentions_any(exclude_patterns, key, value):
            return False
        if not any(pattern.search(value) for pattern in include_patterns):
            return False
        if mentions_any(additional_exclude_patterns, key, value):
            return False
        return any(pattern.search(value) for pattern in additional_include_patterns)

    # ``path`` is an immutable tuple: no shared mutable default argument.
    def recursive_search(current_data, path=()):
        if isinstance(current_data, dict):
            for key, value in current_data.items():
                # Keys are stringified so non-string keys cannot break the
                # substring tests or the dotted join below.
                key_text = str(key)
                new_path = path + (key_text,)
                if isinstance(value, (dict, list)):
                    recursive_search(value, new_path)
                    continue
                value_text = str(value)
                if is_reference(key_text, value_text):
                    final_matching.append({".".join(new_path): value_text})
        elif isinstance(current_data, list):
            for item in current_data:
                recursive_search(item, path)

    recursive_search(data)
    return final_matching
|
||
|
||
def reformat_questions(match_keys, input_path, output_folder):
    """Sort matched entries into clause-number lookups and free-text questions.

    For each ``{key: value}`` entry:
      * value mentions '招标公告' or '第一章': on the first such entry,
        ``truncate_pdf_main(input_path, output_folder, 5)`` is called once and,
        if it returns something truthy, its first item is passed to
        ``convert_clause_to_json(..., output_folder, 2)``. If the value also
        contains a clause number and files were generated, the number goes to
        ``entries_with_numbers2``; otherwise a question is generated.
      * otherwise, a value containing a clause number goes to
        ``entries_with_numbers``;
      * anything else becomes a question.

    A clause number is a dotted digit sequence such as ``3.7.3`` with an
    optional parenthesised sub-number. Half-width and full-width brackets are
    both accepted and the result is normalised to half-width, e.g. ``3.7.3(4)``.

    Args:
        match_keys: list of single-item dicts ``{key: value_text}``. Empty
            dicts are skipped.
        input_path: path handed to ``truncate_pdf_main``.
        output_folder: folder handed to both project helpers.

    Returns:
        Tuple ``(entries_with_numbers, formatted_questions,
        entries_with_numbers2, clause2_path)``. ``clause2_path`` is ``""``
        unless ``convert_clause_to_json`` was called.
    """
    entries_with_numbers = []
    entries_with_numbers2 = []  # entries that point into chapter 1 / the tender notice
    formatted_questions = []
    clause2_path = ""

    # \uFF08 / \uFF09 are the full-width brackets. They are written as escapes
    # so the pattern survives text normalisation of the source file.
    pattern = re.compile(r'(\d+(?:\.\d+)+)(?:[(\uFF08](\d+)[)\uFF09])?')

    def clause_number(match):
        # "3.7.3" plus "(4)" when the optional sub-number was captured.
        sub_number = match.group(2)
        return match.group(1) + (f"({sub_number})" if sub_number else "")

    def build_question(key, value):
        revised_value = value.replace('符合', '')
        return f"关于'{key}',{revised_value}的内容是怎样的?请按json格式给我提供信息,键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'。"

    generated_files = None
    file_processed = False

    for entry in match_keys:
        if not entry:
            continue
        key, value = next(iter(entry.items()))
        match = pattern.search(value)
        refers_to_notice = '招标公告' in value or '第一章' in value

        if refers_to_notice and not file_processed:
            # The file work is done at most once per call, even if it fails.
            generated_files = truncate_pdf_main(input_path, output_folder, 5)
            file_processed = True
            if generated_files:
                clause2_path = convert_clause_to_json(generated_files[0], output_folder, 2)

        if not match:
            formatted_questions.append(build_question(key, value))
        elif not refers_to_notice:
            entries_with_numbers.append({key: clause_number(match)})
        elif generated_files:
            entries_with_numbers2.append({key: clause_number(match)})
        else:
            # A numbered reference to the notice with no generated file to
            # resolve it against: fall back to asking.
            formatted_questions.append(build_question(key, value))

    return entries_with_numbers, formatted_questions, entries_with_numbers2, clause2_path
|
||
|
||
|
||
def process_reviews(original_dict_data, knowledge_name, truncate0_jsonpath, clause_json_path, input_file, output_folder):
    """Resolve cross-references in the review data and merge the results back.

    Steps, in order:
      1. ``extract_matching_keys`` finds the values that refer to another
         clause, e.g.
         [{'形式评审标准.多标段投标': '符合第二章“投标人须知”第 10.1款规定'}].
      2. ``reformat_questions`` splits them into clause-number lookups
         (e.g. [{'形式评审标准.多标段投标': '3.7.4(5)'}]), chapter-1 lookups
         and free-text questions.
      3. ``process_and_merge_entries`` resolves the clause-number lookups from
         ``truncate0_jsonpath`` / ``clause_json_path``.
      4. ``judge_second_jump`` moves results that still point at chapter 1 /
         the tender notice into a second batch of questions.
      5. ``process_and_merge2`` resolves the chapter-1 lookups, if any.
      6. ``multi_threading`` sends all questions to the model for
         ``knowledge_name``; each usable answer is parsed with
         ``extract_content_from_json``.
      7. ``update_json_data`` merges everything into ``original_dict_data``.

    Args:
        original_dict_data: review data to complete (dict, or whatever
            ``extract_matching_keys`` / ``update_json_data`` accept).
        knowledge_name: identifier passed through to ``multi_threading``.
        truncate0_jsonpath: passed through to ``process_and_merge_entries``.
        clause_json_path: passed through to ``process_and_merge_entries``.
        input_file: passed through to ``reformat_questions``.
        output_folder: passed through to ``reformat_questions``.

    Returns:
        The value returned by ``update_json_data``.
    """
    matched_keys = extract_matching_keys(original_dict_data)
    (
        entries_with_numbers,
        formatted_questions1,
        entries_with_numbers2,
        clause2_path,
    ) = reformat_questions(matched_keys, input_file, output_folder)

    combined_results = process_and_merge_entries(entries_with_numbers, truncate0_jsonpath, clause_json_path)
    # Check whether any resolved clause requires a second jump.
    combined_results1, formatted_questions2 = judge_second_jump(combined_results)

    combined_results2 = {}
    if entries_with_numbers2:  # references into chapter 1 / the tender notice
        combined_results2 = process_and_merge2(entries_with_numbers2, clause2_path)

    # Entries without a clause number are asked directly.
    results_2 = multi_threading(formatted_questions1 + formatted_questions2, knowledge_name)

    first_response_list = []
    # Presumably each result is a (query index, response) pair with the answer
    # text at response[1] -- verify against multi_threading.
    for query_index, response in results_2:
        try:
            if response and len(response) > 1:
                first_response_list.append(extract_content_from_json(response[1]))
            else:
                print(f"形式响应评审:Warning: Missing or incomplete response data for query index {query_index}.")
        except Exception as e:
            # Best effort: one unparsable answer must not abort the batch.
            print(f"形式响应评审:Error processing response for query index {query_index}: {e}")

    return update_json_data(original_dict_data, combined_results1, combined_results2, first_response_list)
|
||
|
||
|
||
# Manual smoke run: processes one hard-coded sample tender and prints the
# completed review JSON together with the elapsed wall-clock time.
if __name__ == "__main__":
    start_time = time.time()
    knowledge_name = "zbfile"
    # Machine-specific sample paths; adjust before running elsewhere.
    truncate_tobidders_table_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\truncate_output.json"
    clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\clause.json"
    original_dict_data = {'形式评审标准': {'投标文件': '符合第一章规定', '投标人名称': '符合招标公告第3.2条规定', '投标文件签字': '符合第二章“投标人须知”第3.7.3(4)目规定', '投标文件格式、内容': '符合第七章“投标文件格式”的要求,实质性内容齐全、关键字迹清晰可辨', '联合体投标人': '提交联合体协议书,并明确联合体牵头人', '报价唯一': '只能有一个有效报价', '多标段投标': '符合第二章“投标人须知”第 10.1款规定'}, '响应性评审标准': {'投标内容': '符合第二章“投标人须知”第1.3.1项规定', '工期': '符合第二章“投标人须知”第1.3.2项规定', '工程质量': '符合第二章“投标人须知”第1.3.3项规定'}}

    input_file = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile.pdf"
    output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\temp8\\3abb6e16-19db-42ad-9504-53bf1072dfe7"

    formal_json = process_reviews(original_dict_data, knowledge_name, truncate_tobidders_table_json_path, clause_path, input_file, output_folder)
    data = json.dumps(formal_json, ensure_ascii=False, indent=4)

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Function execution took {elapsed_time} seconds.")
    print(data)
|
||
|
||
# Example question prompt: 关于'技术暗标',第二章“投标人须知”规定的内容是怎样的?请按json格式给我提供信息,键名为'技术暗标',请你忠于原文,回答要求完整准确,不要擅自总结、删减,且不要回答诸如'见投标人须知前附表'或'见第x.x项规定'这类无实质性内容的回答。