zbparse/flask_app/工程标/形式响应评审.py

365 lines
18 KiB
Python
Raw Normal View History

# -*- encoding:utf-8 -*-
2024-10-21 17:31:48 +08:00
import os
2024-08-29 16:37:09 +08:00
import re
import json
import time
2024-10-22 10:06:22 +08:00
from flask_app.general.多线程提问 import multi_threading
2024-12-06 14:40:22 +08:00
from flask_app.工程标.根据条款号整合json import process_and_merge_entries,process_and_merge2
2024-10-22 10:06:22 +08:00
from flask_app.general.json_utils import clean_json_string
2024-12-06 14:40:22 +08:00
from flask_app.工程标.提取json工程标版 import convert_clause_to_json
2024-10-22 10:06:22 +08:00
from flask_app.general.通义千问long import upload_file
2024-11-01 14:39:39 +08:00
from flask_app.general.merge_pdfs import merge_pdfs
2024-08-29 16:37:09 +08:00
# Role/skills/limits prompt (in Chinese) describing a document-processing
# expert; `${document1}` is a literal placeholder inside the text.
# NOTE(review): no code visible in this file references `prompt` — confirm
# whether another module imports it before removing.
prompt = """
# 角色
你是一个文档处理专家专门负责理解和操作基于特定内容的文档任务这包括解析总结搜索或生成与给定文档相关的各类信息
## 技能
### 技能 1文档解析与摘要
- 深入理解并分析${document1}的内容提取关键信息
- 根据需求生成简洁明了的摘要保持原文核心意义不变
### 技能 2信息检索与关联
- ${document1}中高效检索特定信息或关键词
- 能够识别并链接到文档内部或外部的相关内容增强信息的连贯性和深度
## 限制
- 所有操作均需基于${document1}的内容不可超出此范围创造信息
- 在处理敏感或机密信息时需遵守严格的隐私和安全规定
- 确保所有生成或改编的内容逻辑连贯无误导性信息
请注意上述技能执行时将直接利用并参考${document1}的具体内容以确保所有产出紧密相关且高质量
"""
2024-09-03 18:04:05 +08:00
def update_json_data(original_data, updates1, updates2, second_response_list):
    """
    Overlay several sets of updates onto ``original_data`` (in place).

    Keys may be dot-separated (``"a.b.c"``) to address nested dictionaries.
    Sources are applied in order: ``updates1``, ``updates2``, then every
    dictionary in ``second_response_list``; later sources win on conflicts.

    Args:
        original_data: dict, the JSON-like data to update. It is mutated.
        updates1: dict of (possibly dotted) key -> value, or None.
        updates2: dict of (possibly dotted) key -> value, or None.
        second_response_list: list of dicts with the same shape, or None.
            Entries that are not dicts are skipped.

    Returns:
        The same ``original_data`` object after all updates were applied.
    """
    def recursive_update(data, key, value):
        # Walk down the dotted path, creating dictionaries as needed.
        keys = key.split('.')
        for k in keys[:-1]:
            child = data.get(k)
            if not isinstance(child, dict):
                # A missing or non-dict intermediate node cannot hold the
                # nested key, so it is replaced instead of raising.
                child = {}
                data[k] = child
            data = child
        leaf = keys[-1]
        existing = data.get(leaf)
        if isinstance(value, dict) and isinstance(existing, dict):
            # Shallow-merge dict onto dict so untouched sibling keys survive.
            data[leaf] = {**existing, **value}
        else:
            data[leaf] = value

    sources = [updates1, updates2]
    sources.extend(second_response_list or [])
    for source in sources:
        # Tolerate None / malformed sources rather than crashing on .items().
        if not isinstance(source, dict):
            continue
        for key, value in source.items():
            recursive_update(original_data, key, value)
    return original_data
2024-10-21 17:31:48 +08:00
def judge_second_jump(input_dict):
    """
    Split out entries whose text still points at the tender notice / chapter one.

    Every string value matching the chapter pattern is removed from
    ``input_dict`` (in place) and turned into a question that asks for the
    corresponding content of chapter one. Non-string values are left
    untouched instead of being fed to the regex.

    Args:
        input_dict: dict mapping (possibly dotted) keys to extracted text.
            ``None`` is treated as an empty dict.

    Returns:
        (input_dict, formatted_questions): the remaining entries and the list
        of generated question strings (empty when nothing matched).
    """
    if input_dict is None:
        return {}, []

    chapter_pattern = re.compile(r'\s*标\s*公\s*告|第\s*一\s*章')
    question_template = (
        "请你根据招标文件中的内容回答:关于{short_key}{additional_text},第一章招标公告中的内容是怎样的?"
        "请按json格式给我提供信息键名为'{full_key}',而键值需要完全与原文保持一致,不要擅自总结、删减,"
        "如果存在未知信息,请在对应键值处填'未知'"
    )

    formatted_questions = []
    keys_to_remove = []
    for key, value in input_dict.items():
        # Only plain strings can be searched; nested or numeric values are kept.
        if not isinstance(value, str) or not chapter_pattern.search(value):
            continue
        formatted_questions.append(question_template.format(
            short_key=key.rsplit('.', 1)[-1],  # last segment of a dotted key
            full_key=key,
            additional_text=get_additional_text(key),
        ))
        keys_to_remove.append(key)

    # Delete after the loop: a dict must not change size while iterated.
    for key in keys_to_remove:
        del input_dict[key]
    return input_dict, formatted_questions
2024-10-21 17:31:48 +08:00
def extract_matching_keys(original_data):
    """
    Collect leaf values that merely reference a numbered clause or chapter.

    Values that are already self-contained are not returned.

    :param original_data: dict / list, or a JSON string that decodes to one, e.g.
        {
            "投标要求": {
                "资质要求": "符合第3章第2条规定",
                "业绩要求": "满足招标文件第4章相关条目"
            },
            "其他信息": {
                "投标文件格式": "按要求填写"
            }
        }
    :return: list of single-item dicts keyed by the dotted path of the leaf, e.g.
        [
            {"投标要求.资质要求": "符合第3章第2条规定"},
            {"投标要求.业绩要求": "满足招标文件第4章相关条目"}
        ]
    """
    # A JSON string is decoded first; anything else is used as given.
    tree = json.loads(original_data) if isinstance(original_data, str) else original_data

    reference_res = [
        re.compile(r"符合第"),
        re.compile(r"第.*?章"),
        re.compile(r"第.*?款"),
        re.compile(r"第.*?项"),
        re.compile(r"第.*?目"),
        re.compile(r"第.*?条"),
    ]
    confirm_res = [re.compile(r"规定"), re.compile(r"条目")]
    hard_excludes = ["投标文件格式", "权利义务", "技术标准", "工程量清单"]
    soft_excludes = ["按要求", "按规定"]

    def mentions_any(words, name, text):
        # True when any word occurs in the key name or in the value text.
        return any(word in name or word in text for word in words)

    def is_reference(name, text):
        # Checks run in this order: hard excludes, reference pattern,
        # soft excludes, confirmation pattern.
        if mentions_any(hard_excludes, name, text):
            return False
        if not any(rx.search(text) for rx in reference_res):
            return False
        if mentions_any(soft_excludes, name, text):
            return False
        return any(rx.search(text) for rx in confirm_res)

    def iter_leaves(node, trail):
        # Depth-first walk yielding (key, path, text) for scalar dict values.
        # List items inherit the path of the list; scalar list items are skipped.
        if isinstance(node, dict):
            for name, child in node.items():
                child_trail = trail + (name,)
                if isinstance(child, (dict, list)):
                    yield from iter_leaves(child, child_trail)
                else:
                    yield name, child_trail, str(child)
        elif isinstance(node, list):
            for element in node:
                yield from iter_leaves(element, trail)

    return [
        {".".join(trail): text}
        for name, trail, text in iter_leaves(tree, ())
        if is_reference(name, text)
    ]
2024-10-21 17:31:48 +08:00
def get_additional_text(key):
    """
    Return a synonym hint for a review item name.

    The first known keyword contained in ``key`` decides the hint; an empty
    string is returned when no keyword occurs in ``key``.
    """
    synonym_hints = (
        ("投标内容", "(或招标范围)"),
        ("工程质量", "(或质量要求)"),
        # further (keyword, hint) pairs can be appended here
    )
    return next((hint for keyword, hint in synonym_hints if keyword in key), "")
2024-10-30 16:56:05 +08:00
def process_tender_entries(match_keys, output_folder, truncate0_jsonpath, clause_path):
    """
    Route each matched review entry to a clause lookup or to an LLM question.

    :param match_keys: list of single-item dicts ``{key: text}``, e.g.
        [
            {"资质要求": "符合相关资质要求"},
            {"业绩要求": "3.1.2 近三年完成类似项目"},
            {"财务要求": "招标公告 2.3 财务状况良好"},
            {"人员要求": "项目经理具有相关资格"}
        ]
    :param output_folder: folder searched (recursively) for a ``*_notice.pdf``.
    :param truncate0_jsonpath: path of the truncated JSON; may be empty.
    :param clause_path: path of the clause JSON; may be empty.
    :return: tuple of
        - entries_with_numbers: ``{key: clause_no}`` items whose text carries a
          clause number and does not mention the notice / chapter one.
        - formatted_questions: question strings for everything that cannot be
          resolved through a clause number.
        - entries_with_numbers2: ``{key: clause_no}`` items that mention the
          notice / chapter one; only filled when ``clause2_path`` is non-empty.
        - clause2_path: value returned by ``convert_clause_to_json`` for the
          notice PDF, or "" when no notice JSON is available.
    """
    truncate0_empty = not truncate0_jsonpath or not os.path.exists(truncate0_jsonpath)
    clause_empty = not clause_path or not os.path.exists(clause_path)
    both_empty = truncate0_empty and clause_empty

    entries_with_numbers = []
    entries_with_numbers2 = []
    formatted_questions = []
    clause2_path = ""

    # Dotted clause number ("3.7.3"), optionally followed by a bracketed
    # sub-item in half-width "(3)" or full-width brackets. The full-width
    # brackets are written as \uff08 / \uff09 escapes so the character class
    # cannot be damaged by punctuation-stripping tools.
    pattern = re.compile(r'(\d+(?:\.\d+)+)(?:[(\uff08](\d+)[)\uff09])?')
    # Matches the tender notice / chapter one, allowing inner whitespace.
    chapter_pattern = re.compile(r'\s*标\s*公\s*告|第\s*一\s*章')
    question_template = (
        "请你根据招标文件中的内容回答:关于{short_key}{additional_text}{revised_value}的内容是怎样的?"
        "请按json格式给我提供信息键名为'{full_key}',而键值需要完全与原文保持一致,不要擅自总结、删减,"
        "如果存在未知信息,请在对应键值处填'未知'"
    )

    def format_num(match):
        """Render the matched clause number, appending "(n)" for a sub-item."""
        return match.group(1) + (f"({match.group(2)})" if match.group(2) else "")

    def add_formatted_question(key, value):
        """Build a question for ``key`` and append it to formatted_questions."""
        formatted_questions.append(question_template.format(
            short_key=key.rsplit('.', 1)[-1],  # last segment of a dotted key
            full_key=key,
            revised_value=value.replace('符合', ''),
            additional_text=get_additional_text(key),
        ))

    notice_lookup_done = False

    def ensure_notice_clauses():
        """Convert the notice PDF once; report whether its JSON is usable."""
        nonlocal clause2_path, notice_lookup_done
        if not notice_lookup_done:
            # Search and convert a single time instead of once per entry.
            notice_lookup_done = True
            notice_path = next(
                (os.path.join(root, file)
                 for root, _, files in os.walk(output_folder)
                 for file in files if file.endswith('_notice.pdf')),
                "",
            )
            if notice_path:
                # presumably returns the path of the generated JSON — TODO confirm
                clause2_path = convert_clause_to_json(notice_path, output_folder, 2) or ""
        return bool(clause2_path)

    for entry in match_keys or []:
        key, value = next(iter(entry.items()))
        match = pattern.search(value)
        if chapter_pattern.search(value):
            # Notice / chapter-one reference: use the clause number only when
            # the notice JSON exists. Otherwise ask the model, so the entry is
            # never silently dropped.
            if match and ensure_notice_clauses():
                entries_with_numbers2.append({key: format_num(match)})
            else:
                add_formatted_question(key, value)
        elif match and not both_empty:
            entries_with_numbers.append({key: format_num(match)})
        else:
            # No clause number, or no JSON source to look the number up in.
            add_formatted_question(key, value)

    return entries_with_numbers, formatted_questions, entries_with_numbers2, clause2_path
2024-10-21 17:31:48 +08:00
def fetch_specific_pdf(output_folder):
    """
    Pick the PDF that review questions should be asked against.

    ``output_folder`` is scanned recursively:
      * If a file ending in ``merged_baseinfo.pdf`` exists, it is merged with
        every file ending in ``tobidders_notice.pdf`` (in the order they were
        encountered) into ``<output_folder>/merged_reviews.pdf``, and the
        value returned by ``merge_pdfs`` is returned.
      * Otherwise the first file ending in ``invalid.pdf`` is returned.
      * Otherwise "" is returned.

    The whole tree is scanned before deciding, so the result does not depend
    on whether the fallback happens to be listed before the base-info PDF.

    :param output_folder: directory to search.
    :return: path of the PDF to use, or "" when nothing suitable was found.
    """
    pdf_paths = []
    baseinfo_found = False
    invalid_fallback = ""

    for root, _dirs, files in os.walk(output_folder):
        for file in files:
            if file.endswith('merged_baseinfo.pdf'):
                baseinfo_found = True
                pdf_paths.append(os.path.join(root, file))
            elif file.endswith('tobidders_notice.pdf'):
                pdf_paths.append(os.path.join(root, file))
            elif not invalid_fallback and file.endswith('invalid.pdf'):
                # Remember the fallback but keep scanning: a base-info PDF
                # found later must still take precedence.
                invalid_fallback = os.path.join(root, file)

    if baseinfo_found:
        output_path = os.path.join(output_folder, "merged_reviews.pdf")
        return merge_pdfs(pdf_paths, output_path)
    return invalid_fallback
2024-08-29 16:37:09 +08:00
2024-10-21 17:31:48 +08:00
def process_reviews(original_dict_data, output_folder, truncate0_jsonpath, clause_json_path):
    """
    Replace clause references in the review standards with resolved content.

    Steps, in order:
      1. ``extract_matching_keys`` finds values that only reference a clause.
      2. ``process_tender_entries`` splits them into clause-number lookups
         and direct questions.
      3. ``process_and_merge_entries`` / ``process_and_merge2`` resolve the
         numbered entries; ``judge_second_jump`` turns results that still
         point at chapter one into follow-up questions.
      4. If there are questions and ``fetch_specific_pdf`` yields a file, the
         file is uploaded and the questions are asked via ``multi_threading``.
      5. ``update_json_data`` writes everything back into ``original_dict_data``.

    :return: the updated ``original_dict_data``.
    """
    reference_items = extract_matching_keys(original_dict_data)
    numbered, direct_questions, notice_numbered, notice_clause_json = process_tender_entries(
        reference_items, output_folder, truncate0_jsonpath, clause_json_path
    )

    # May be an empty dict.
    clause_texts = process_and_merge_entries(numbered, truncate0_jsonpath, clause_json_path)
    # May be ({}, []).
    resolved, followup_questions = judge_second_jump(clause_texts)

    notice_resolved = {}
    if notice_numbered:
        notice_resolved = process_and_merge2(notice_numbered, notice_clause_json)

    llm_answers = []
    questions = direct_questions + followup_questions
    # The PDF is only looked up (and possibly merged) when there is something to ask.
    review_pdf = fetch_specific_pdf(output_folder) if questions else ""
    if review_pdf:
        file_id = upload_file(review_pdf)
        results = multi_threading(questions, "", file_id, 2)
        if results:
            llm_answers = [clean_json_string(res) for _, res in results]
        print(llm_answers)

    return update_json_data(original_dict_data, resolved, notice_resolved, llm_answers)
2024-08-29 16:37:09 +08:00
2024-10-31 20:12:08 +08:00
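# TODO: currently, whenever the clause / truncate_jsonpath file is non-empty we try to extract from it;
# the extraction can still fail, in which case we may need to fall back to asking the LLM again.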
2024-08-29 16:37:09 +08:00
# Manual smoke test: runs process_reviews on a hard-coded sample and prints
# the resulting JSON together with the elapsed wall-clock time.
if __name__ == "__main__":
    start_time=time.time()
    # knowledge_name="zbfile"
    # Machine-specific sample inputs; adjust the paths before running locally.
    truncate_tobidders_table_json_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\3395fc1b-432d-407d-a872-fc35e8475aef\\truncate_output.json"
    # NOTE: clause_path is defined but the call below passes "" instead.
    clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\3395fc1b-432d-407d-a872-fc35e8475aef\\clause1.json"
    # Sample review standards: a mix of self-contained values and clause references.
    original_dict_data={
        "形式评审标准": {
            "投标文件": "投标文件能正常打开",
            "投标人名称": "与营业执照、资质证书、安全生产许可证一致",
            "投标文件签字盖章": "符合第二章“投标人须知”第 3.7.33目规定",
            "投标文件格式、内容": "符合第八章“投标文件格式”的要求,实质性内容齐全、关键字迹清晰可辨",
            "联合体投标人(如有)": "提交联合体协议书,并明确联合体牵头人",
            "报价唯一": "只能有一个有效报价(指投标函中的大写报价)",
            "多标段投标": "符合第一章规定",
            "“技术暗标”": "符合第二章“投标人须知”第 3.7.45目规定"
        },
        "响应性评审标准": {
            "投标内容": "符合第二章“投标人须知”第 1.3.1 项规定",
            "工期": "符合第二章“投标人须知”第 1.3.2 项规定",
            "工程质量": "符合招标公告投标人须知”第 1.3.3 项规定",
            "投标有效期": "符合第二章“投标人须知”第 3.3.1 项规定",
            "投标保证金": "符合第二章“投标人须知”规定",
            "权利义务": "投标函附录中的相关承诺符合或优于第四章“合同条款及格式”的相关规定",
            "技术标准和要求": "符合第七章“技术标准和要求”规定",
            "分包计划": "符合第二章“投标人须知”规定",
            "算术错误修正": "1)投标人接受算术错误修正后的报价 2)修正后的报价与投标报价相比偏差率不超过±1%",
            "投标价格": "1投标函中的大写报价与已标价工程量清单中的投标总价一致 2投标函中的大写报价不大于本标段招标控制价总价 3算术错误修正后的投标总报价不大于本标段招标控制价总价 4投标报价不低于其成本",
            "已标价工程量清单": "1已标价工程量清单项目编码顺序与第五章“工程量清单”给出的项目编码顺序一致 2已标价工程量清单符合第五章“工程量清单”给出的项目编码、项目名称、项目特征、计量单位和工程量 3暂列金额符合第五章“工程量清单”列出的金额 4专业工程暂估价符合第五章“工程量清单”列出的金额 5材料工程设备暂估价符合第五章“工程量清单”列出的单价并计入综合单价 6安全文明施工费、规费和税金等不可竞争费用按照规定的标准计取 7计税方法符合招标文件的约定 8已标价工程量清单项目未填报的项目不超过三项或不超过三项未填报的项目的费用合计按招标控制价相应项目的费用合计计算不超过其投标总报价修正后的投标总报价如有的±1%"
        }
    }
    output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test"
    # notice_path="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_notice.pdf"
    # The clause JSON path is passed as "" here, so only the truncated JSON is supplied.
    formal_json = process_reviews(original_dict_data,output_folder, truncate_tobidders_table_json_path, "")
    # ensure_ascii=False keeps the Chinese text readable in the printed output.
    data = json.dumps(formal_json, ensure_ascii=False, indent=4)
    end_time=time.time()
    elapsed_time = end_time - start_time
    print(f"Function execution took {elapsed_time} seconds.")
    print(data)
#关于'技术暗标'第二章“投标人须知”规定的内容是怎样的请按json格式给我提供信息键名为'技术暗标',请你忠于原文,回答要求完整准确,不要擅自总结、删减,且不要回答诸如'见投标人须知前附表'或'见第x.x项规定'这类无实质性内容的回答。