2024-09-23 17:44:34 +08:00
|
|
|
|
# -*- encoding:utf-8 -*-
|
2024-09-23 15:49:30 +08:00
|
|
|
|
import json
|
2024-09-23 17:44:34 +08:00
|
|
|
|
import re
|
2024-09-23 15:49:30 +08:00
|
|
|
|
|
|
|
|
|
from flask_app.main.基础信息整合 import combine_basic_info
|
|
|
|
|
from flask_app.main.通义千问long import qianwen_long,upload_file
|
|
|
|
|
from flask_app.main.多线程提问 import multi_threading
|
2024-09-23 17:44:34 +08:00
|
|
|
|
from flask_app.main.json_utils import combine_json_results,clean_json_string
|
2024-09-26 13:43:47 +08:00
|
|
|
|
from flask_app.main.形式响应评审 import update_json_data,extract_matching_keys
|
2024-09-23 17:44:34 +08:00
|
|
|
|
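# The dict may be nested. Walk its keys and decide on the key names, not the values: when one level holds more than one key and every key name consists only of digits or dots, drop those ordinal key names and rebuild the data; a list of strings can be used to keep entries that sit at the same level.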
|
|
|
|
|
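# For sibling keys: when there is more than one and the key names all follow the same pattern, drop the key names and keep their values in a list.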
|
|
|
|
|
# Within one dict, when all of its key/value pairs have the value "" or "/", drop the values and keep the key names as a list of strings.
|
|
|
|
|
|
|
|
|
|
# Compiled once; used with fullmatch(), so no ^/$ anchors are needed and a
# trailing newline can no longer slip past a `$` anchor.
_NUMERIC_KEY_PATTERN = re.compile(
    r'[\d.]+'                    # digits and dots: "1", "1.2", "3.1.4"
    r'|[(\uFF08]\d+[)\uFF09]'    # bracketed number, ASCII "(1)" or full-width brackets
    r'|[a-zA-Z]'                 # a single letter: "a", "B"
    r'|[a-zA-Z]\d+'              # one letter followed by digits: "a1"
    r'|\d+[a-zA-Z]'              # digits followed by one letter: "1a"
    r'|[a-zA-Z]\.'               # one letter followed by a dot: "a."
)


def is_numeric_key(key):
    """Return True when *key* is an ordinal marker rather than a real name.

    Accepted shapes: digits and dots, a number in ASCII or full-width
    brackets, a single letter, a letter followed by digits, digits followed
    by a letter, or a single letter followed by a dot.  Letter-digit-letter
    mixes such as "a1b" are rejected.

    The whole string has to match; the previous ``re.match`` + ``$`` form
    also accepted keys ending in a newline.  The previous ``^(\\d+)$``
    alternative was a plain capturing group already covered by the
    digits-and-dots rule, so full-width bracketed ordinals were never
    recognised; they are now.

    Args:
        key: The dictionary key to classify (a string).

    Returns:
        bool: True if the key looks like an ordinal marker.
    """
    return _NUMERIC_KEY_PATTERN.fullmatch(key) is not None
|
|
|
|
|
# TODO: values that contain digits are not handled correctly yet.
|
|
|
|
|
# zbtest20 also has problems.
|
|
|
|
|
def contains_number_or_index(key, value):
    """Decide whether a key/value pair is a serial-number entry to drop.

    Rules, in order:
      1. A string value containing both a digit and a Chinese character is
         kept (returns False), regardless of the key.
      2. Otherwise the pair is dropped (returns True) when the value is an
         int/float, is a string containing at least one digit, or the key
         contains "序号".

    The result is now always a real ``bool``; previously the function could
    return a ``re.Match`` object or ``None``.

    NOTE: ``bool`` is a subclass of ``int``, so ``True``/``False`` values
    count as numeric here, exactly as before.

    Args:
        key: The dictionary key (a string).
        value: The value stored under *key*; any type.

    Returns:
        bool: True if the pair should be removed, False if it should be kept.
    """
    value_is_text = isinstance(value, str)
    has_digit = value_is_text and re.search(r'\d', value) is not None
    has_chinese = value_is_text and re.search(r'[\u4e00-\u9fff]', value) is not None

    # Digits embedded in Chinese text are real content, not a serial number.
    if has_digit and has_chinese:
        return False

    is_number = isinstance(value, (int, float)) or (value_is_text and value.isdigit())
    return bool(is_number or '序号' in key or has_digit)
|
|
|
|
|
|
2024-09-23 17:57:30 +08:00
|
|
|
|
# Within one dict, when all of its key/value pairs have the value "" or "/", drop the values and keep the key names as a list of strings.
|
|
|
|
|
# If the key name is "序号" (serial number) or the value consists entirely of digits, drop that serial-number entry.
|
2024-09-23 17:44:34 +08:00
|
|
|
|
def preprocess_dict(data):
    """Recursively strip placeholder values and serial-number entries.

    * Lists are processed element by element.
    * Anything that is neither a dict nor a list is returned unchanged.
    * A dict with at most one entry only has its value processed.
    * A dict with several entries whose values are all "" or "/" collapses
      to the list of its keys.
    * Any other dict drops the entries flagged by
      ``contains_number_or_index`` and the entries whose processed value is
      the empty string.

    Args:
        data: Parsed JSON data (dict, list or scalar).

    Returns:
        The cleaned structure; a dict may come back as a list of keys.
    """
    if isinstance(data, dict):
        if len(data) <= 1:
            return {name: preprocess_dict(child) for name, child in data.items()}

        # Every value is a placeholder: only the key names carry information.
        if all(child == "" or child == "/" for child in data.values()):
            return list(data.keys())

        kept = {}
        for name, child in data.items():
            if contains_number_or_index(name, child):
                continue
            cleaned = preprocess_dict(child)
            if cleaned != "":  # keep non-empty values only
                kept[name] = cleaned
        return kept

    if isinstance(data, list):
        return [preprocess_dict(element) for element in data]

    return data
|
|
|
|
|
def _ordinal_sort_key(entry):
    """Return a natural-order sort key for a ``(key, value)`` pair.

    The key string is split into alternating text / digit-run chunks and the
    digit runs are compared as integers, so "2" sorts before "10" and "1.2"
    before "1.10".  Chunks at the same position always have the same type
    (text at even positions, int at odd positions), so keys stay comparable.
    """
    chunks = re.split(r'(\d+)', entry[0])
    return [int(chunk) if index % 2 else chunk for index, chunk in enumerate(chunks)]


def process_dict(data):
    """Recursively replace ordinal-keyed entries with ordered lists.

    Keys accepted by ``is_numeric_key`` are removed and their processed
    values are gathered, in natural key order, into a list stored under
    ``'items'``.  If that list is the only thing left, the list itself is
    returned instead of a dict.

    The ordinal keys used to be sorted as plain strings, which placed "10"
    before "2"; they are now sorted in natural (numeric-aware) order.

    For a list stored under a regular key, each element is processed; a
    dict element with exactly one entry is replaced by its processed value,
    and a processed element that is a one-element list is flattened to that
    element.

    NOTE: a regular key literally named ``'items'`` overwrites the collected
    ordinal list, because regular keys are written after it.

    Args:
        data: Parsed (and usually pre-processed) JSON data.

    Returns:
        *data* unchanged if it is not a dict; otherwise the rebuilt dict, or
        a list when only ordinal keys were present.
    """
    if not isinstance(data, dict):
        return data

    ordinal_entries = []
    named_entries = {}
    for key, value in data.items():
        if is_numeric_key(key):
            ordinal_entries.append((key, value))
        else:
            named_entries[key] = value

    result = {}
    if ordinal_entries:
        ordinal_entries.sort(key=_ordinal_sort_key)
        result['items'] = [process_dict(value) for _, value in ordinal_entries]

    for key, value in named_entries.items():
        if not isinstance(value, list):
            result[key] = process_dict(value)
            continue

        processed_list = []
        for item in value:
            if isinstance(item, dict) and len(item) == 1:
                # A single-entry dict is a wrapper: keep only its value.
                processed_item = process_dict(next(iter(item.values())))
            else:
                processed_item = process_dict(item)

            # Flatten a result that is a list holding exactly one element.
            if isinstance(processed_item, list) and len(processed_item) == 1:
                processed_item = processed_item[0]

            processed_list.append(processed_item)
        result[key] = processed_list

    if len(result) == 1 and 'items' in result:
        return result['items']

    return result
|
2024-09-26 13:43:47 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Matches "第x章" / "第x款" where x is a run of Chinese numerals (一..十) or digits.
_CHAPTER_CLAUSE_PATTERN = re.compile(r'第[一二三四五六七八九十\d]+[章款]')


def find_chapter_clause_references(data, parent_key=""):
    """Collect every string value that refers to a chapter or clause.

    The structure is walked recursively.  Nested dict keys are joined with
    "." and list positions are appended as "[index]", so each hit can be
    traced back to where it sits.

    Lists are now traversed as well; previously only dicts were followed,
    so strings held in lists were never reported.  An input that is neither
    a dict nor a list now yields an empty result instead of raising
    ``AttributeError``.

    Args:
        data: The dict (or list) to search.
        parent_key: Path prefix of *data* inside the outer structure;
            empty for the top-level call.

    Returns:
        list[dict]: One ``{full_key: value}`` dict per matching string, in
        traversal order.
    """
    if isinstance(data, dict):
        children = [
            (f"{parent_key}.{key}" if parent_key else key, value)
            for key, value in data.items()
        ]
    elif isinstance(data, list):
        children = [
            (f"{parent_key}[{index}]", item)
            for index, item in enumerate(data)
        ]
    else:
        return []

    result = []
    for full_key, value in children:
        if isinstance(value, (dict, list)):
            result.extend(find_chapter_clause_references(value, full_key))
        elif isinstance(value, str) and _CHAPTER_CLAUSE_PATTERN.search(value):
            result.append({full_key: value})
    return result
|
2024-09-23 15:49:30 +08:00
|
|
|
|
def qualification_review(truncate_file):
    """Ask the model for the two review standards and merge the answers.

    The file is uploaded with ``upload_file``, two questions (qualification
    review and compliance review) are sent through ``multi_threading``, and
    each response is cleaned with ``clean_json_string``, then normalised by
    ``preprocess_dict`` and ``process_dict`` before being merged into one
    dict.  Raw responses and the detected chapter/clause references are
    printed.

    Args:
        truncate_file: Path of the document to upload.

    Returns:
        dict: The merged, normalised answers of both questions.
    """
    file_id = upload_file(truncate_file)

    qualification_query = "该招标文件中规定的资格性审查标准是怎样的?请以json格式给出,外层为'资格性审查',你的回答要与原文一致,不可擅自总结删减,也不要回答有关符合性性审查的内容。"
    compliance_query = "该招标文件中规定的符合性审查标准是怎样的?请以json格式给出,外层为'符合性审查',你的回答要与原文一致,不可擅自总结删减,也不要回答有关资格性审查的内容。"
    user_query = [qualification_query, compliance_query]

    # NOTE(review): the meaning of the "" and 2 arguments is defined by
    # multi_threading, which is not visible here - confirm against it.
    results = multi_threading(user_query, "", file_id, 2)

    combined_res = {}
    for _question, response in results:
        print(response)
        normalized = process_dict(preprocess_dict(clean_json_string(response)))
        combined_res.update(normalized)

    # The references are only printed; they are not part of the return value.
    print(find_chapter_clause_references(combined_res))

    # Core code for integrating the basic information.
    return combined_res
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual run against a local sample file; the merged result is
    # pretty-printed as JSON with non-ASCII characters kept readable.
    truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output3\\6.2定版视频会议磋商文件_qualification2.pdf"
    res = qualification_review(truncate_file)
    rendered = json.dumps(res, ensure_ascii=False, indent=4)
    print(rendered)
|