2024-09-23 17:44:34 +08:00
|
|
|
|
# -*- encoding:utf-8 -*-
|
2024-09-23 15:49:30 +08:00
|
|
|
|
import json
|
2024-10-22 21:02:54 +08:00
|
|
|
|
import os
|
2024-09-23 17:44:34 +08:00
|
|
|
|
import re
|
2024-11-07 15:46:24 +08:00
|
|
|
|
import time
|
|
|
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
|
|
|
2024-11-05 16:29:32 +08:00
|
|
|
|
from flask_app.general.通义千问long import upload_file, qianwen_long
|
2024-10-22 10:06:22 +08:00
|
|
|
|
from flask_app.general.多线程提问 import multi_threading
|
|
|
|
|
from flask_app.general.json_utils import clean_json_string
|
2024-10-12 18:01:59 +08:00
|
|
|
|
from flask_app.货物标.投标人须知正文条款提取成json文件货物标版 import convert_clause_to_json
|
2024-11-01 17:55:26 +08:00
|
|
|
|
import copy
|
2024-11-05 16:29:32 +08:00
|
|
|
|
import concurrent.futures
|
2024-10-12 18:01:59 +08:00
|
|
|
|
# 这个字典可能有嵌套,你需要遍历里面的键名,对键名作判断,而不是键值,具体是这样的:如果处于同一层级的键的数量>1并且键名全由数字或点号组成。那么就将这些序号键名全部删除,重新组织成一个字典格式的数据,你可以考虑用字符串列表来保持部分平级的数据
|
|
|
|
|
# 对于同级的键,如果数量>1且键名都统一,那么将键名去掉,用列表保持它们的键值
|
2024-09-23 17:44:34 +08:00
|
|
|
|
|
|
|
|
|
def is_numeric_key(key):
    """Tell whether *key* looks like a serial/ordinal label rather than a name.

    Accepted shapes are: digits and dots ("1", "1.2", "3."), a number in
    parentheses ("(1)"), a single ASCII letter ("a"), a letter followed by
    digits ("A1"), digits followed by a letter ("1a"), and a letter followed
    by a dot ("a."). A letter-digit-letter mix such as "a1b" is rejected.

    Args:
        key (str): Dictionary key to classify.

    Returns:
        bool: True when the key matches one of the shapes above.
    """
    pattern = r'^[\d.]+$|^\(\d+\)$|^(\d+)$|^[a-zA-Z]$|^[a-zA-Z]\d+$|^\d+[a-zA-Z]$|^[a-zA-Z]\.$'
    matched = re.match(pattern, key)
    # A match object is always truthy, so bool() mirrors an "is not None" test.
    return bool(matched)
|
2024-10-12 18:01:59 +08:00
|
|
|
|
|
2024-10-30 18:08:46 +08:00
|
|
|
|
|
2024-09-23 17:44:34 +08:00
|
|
|
|
def contains_number_or_index(key, value):
    """Decide whether a key/value pair is a serial-number entry.

    A pair is reported (True) when any of the following holds:
      * the value is an int/float (``bool`` included, since it subclasses
        ``int``) or a string made only of digits;
      * the key contains the text "序号";
      * the value is a string that contains a digit.
    The one exception: a string value that contains both a digit and a CJK
    character (U+4E00..U+9FFF) is always kept, i.e. the result is False,
    even when the key contains "序号".

    Args:
        key (str): Dictionary key.
        value: Dictionary value of any type.

    Returns:
        bool: Always a real ``bool``. The earlier implementation could leak a
        ``re.Match`` object or ``None``; truthiness is unchanged.
    """
    value_is_text = isinstance(value, str)
    has_digit = value_is_text and re.search(r'\d', value) is not None
    has_chinese = value_is_text and re.search(r'[\u4e00-\u9fff]', value) is not None

    # Digits mixed with Chinese text are treated as real content, not as an index.
    if has_digit and has_chinese:
        return False

    is_number = isinstance(value, (int, float)) or (value_is_text and value.isdigit())
    return bool(is_number or '序号' in key or has_digit)
|
|
|
|
|
|
2024-10-12 18:01:59 +08:00
|
|
|
|
|
|
|
|
|
# 对于同一个字典中,可能存在若干键值对,若它们的键值都是""或者"/" 你就将它们的键值删去,它们的键名用字符串列表保存
|
|
|
|
|
# 如果键名是"序号"或者键值中全是数字,删去序号
|
2024-09-23 17:44:34 +08:00
|
|
|
|
def preprocess_dict(data):
    """Recursively strip placeholder values and serial-number entries.

    Rules, applied at every nesting level:
      * lists are processed element by element;
      * non-container values are returned untouched;
      * a dict with at most one entry only has its values processed;
      * a dict with several entries whose values are all "", "/" or an empty
        list collapses into the list of its keys;
      * otherwise entries flagged by ``contains_number_or_index`` are dropped,
        and so are entries whose processed value is the empty string.

    Args:
        data: Arbitrary JSON-like structure.

    Returns:
        The cleaned structure (dict, list, or the original scalar).
    """
    if isinstance(data, list):
        return [preprocess_dict(element) for element in data]
    if not isinstance(data, dict):
        return data
    if len(data) <= 1:
        return {name: preprocess_dict(child) for name, child in data.items()}

    def is_placeholder(candidate):
        # "", "/" and [] all mean "no content supplied".
        return candidate == "" or candidate == "/" or (isinstance(candidate, list) and not candidate)

    if all(is_placeholder(child) for child in data.values()):
        return list(data.keys())

    cleaned = {}
    for name, child in data.items():
        if contains_number_or_index(name, child):
            continue
        converted = preprocess_dict(child)
        if converted != "":  # keep only non-empty results
            cleaned[name] = converted
    return cleaned
|
2024-09-29 18:01:55 +08:00
|
|
|
|
|
2024-10-12 18:01:59 +08:00
|
|
|
|
|
2024-09-23 17:44:34 +08:00
|
|
|
|
def process_dict(data):
    """Recursively flatten serial-numbered dictionaries into lists.

    Keys recognised by ``is_numeric_key`` are removed: their values are
    processed recursively, ordered by their numeric parts and stored under
    ``'items'``. For every other key, a list value is processed element by
    element (a one-entry dict element is replaced by its processed value,
    a one-element list result is unwrapped) and a list that ends up with a
    single element is replaced by that element.

    Args:
        data (dict): Input data. Any non-dict value is returned unchanged.

    Returns:
        dict | list | original type: The ``'items'`` list alone when it is the
        only resulting key; the list of key names when every resulting value
        is an empty list (this also makes an empty dict come back as ``[]``);
        otherwise the processed dict.
    """
    if not isinstance(data, dict):
        return data

    result = {}
    numeric_keys = []
    non_numeric_keys = {}

    # Split the entries into serial-number keys and ordinary keys.
    for key, value in data.items():
        if is_numeric_key(key):
            numeric_keys.append((key, value))
        else:
            non_numeric_keys[key] = value

    # Serial-number keys: process the values and collect them, ordered, in 'items'.
    if numeric_keys:
        def sort_key(item):
            # Compare dotted numbers component-wise as integers. Converting
            # "1.10" with float() would make it equal to "1.1" and place it
            # before "1.2", which is wrong for clause numbering.
            key = item[0]
            try:
                return tuple(int(part) for part in key.split('.') if part.isdigit())
            except ValueError:
                # str.isdigit() accepts a few characters int() rejects (e.g.
                # superscript digits); such keys get the neutral key ().
                return ()

        # sorted() is stable, so keys without numeric parts keep their input order.
        numeric_keys_sorted = sorted(numeric_keys, key=sort_key)
        result['items'] = [process_dict(item[1]) for item in numeric_keys_sorted]

    # Ordinary keys.
    for key, value in non_numeric_keys.items():
        if isinstance(value, list):
            processed_list = []
            for item in value:
                if isinstance(item, dict) and len(item) == 1:
                    # A one-entry dict contributes only its (processed) value.
                    processed_item = process_dict(next(iter(item.values())))
                else:
                    processed_item = process_dict(item)

                # Unwrap a result that is a one-element list.
                if isinstance(processed_item, list) and len(processed_item) == 1:
                    processed_item = processed_item[0]

                processed_list.append(processed_item)

            # A list with a single element is replaced by that element.
            if len(processed_list) == 1:
                result[key] = processed_list[0]
            else:
                result[key] = processed_list
        else:
            # Not a list: just recurse.
            result[key] = process_dict(value)

    # Only 'items' present: return the list itself.
    if len(result) == 1 and 'items' in result:
        return result['items']

    # Every value is an empty list: the key names become the list entries.
    if all(isinstance(v, list) and not v for v in result.values()):
        return list(result.keys())

    return result
|
2024-09-26 13:43:47 +08:00
|
|
|
|
|
2024-10-12 18:01:59 +08:00
|
|
|
|
|
|
|
|
|
# 查找引用的序号
|
2024-09-26 13:43:47 +08:00
|
|
|
|
def find_chapter_clause_references(data, parent_key=""):
    """Collect string values that point to chapter/clause one or to the notice.

    The structure is walked recursively. A string value is reported when it
    matches ``第[一1]+[章款]``, "公告" or "邀请" and also contains one of
    "符合", "满足" or "详见". Entries whose dotted path or string value
    contains "格式要求" are skipped entirely. Inside lists only dict
    elements are explored; their path segment gets an ``[index]`` suffix.

    Args:
        data: Structure to scan; anything other than a dict yields ``[]``.
        parent_key (str): Dotted path of *data* inside the outer structure.

    Returns:
        list[dict]: One ``{dotted_path: value}`` dict per hit, in traversal order.
    """
    if not isinstance(data, dict):
        return []

    exclude_list = ["格式要求"]
    chapter_clause_pattern = re.compile(r'第[一1]+[章款]|公告|邀请')
    trigger_words = ("符合", "满足", "详见")

    def is_excluded(text):
        return any(marker in text for marker in exclude_list)

    found = []
    for key, value in data.items():
        # Dotted path of the current entry.
        full_key = f"{parent_key}.{key}" if parent_key else key

        if is_excluded(full_key) or (isinstance(value, str) and is_excluded(value)):
            continue

        if isinstance(value, str):
            if chapter_clause_pattern.search(value) and any(word in value for word in trigger_words):
                found.append({full_key: value})
        elif isinstance(value, dict):
            found.extend(find_chapter_clause_references(value, full_key))
        elif isinstance(value, list):
            for index, element in enumerate(value):
                if isinstance(element, dict):
                    # The list position becomes part of the path.
                    found.extend(find_chapter_clause_references(element, f"{full_key}[{index}]"))

    return found
|
2024-09-26 14:08:34 +08:00
|
|
|
|
|
2024-10-12 18:01:59 +08:00
|
|
|
|
|
2024-09-26 18:06:23 +08:00
|
|
|
|
def preprocess_value(value):
    """Extract the sentence fragment that carries a chapter/clause reference.

    When *value* contains "第…章" or "第…款", it is split on commas, the
    ideographic full stop, question marks and exclamation marks (ASCII and
    full-width forms). A separator only counts while the parentheses seen so
    far are balanced, so punctuation inside brackets does not split. The
    first fragment containing "章" or "款" is returned with a leading
    "符合" / "应满足" / "详见" removed.

    Args:
        value: Text to inspect. Non-string input is returned unchanged.

    Returns:
        The extracted fragment, or *value* itself when there is no
        chapter/clause reference.
    """
    if not isinstance(value, str):
        return value
    if not (re.search(r'第(.+?)章', value) or re.search(r'第(.+?)款', value)):
        return value

    # ASCII separators plus the full-width comma, question mark and
    # exclamation mark (written as escapes so they survive re-encoding).
    separators = frozenset(',。?!\uff0c\uff1f\uff01')
    # ASCII and full-width parentheses are balanced independently.
    bracket_pairs = (('(', ')'), ('\uff08', '\uff09'))
    counts = {ch: 0 for pair in bracket_pairs for ch in pair}

    parts = []
    current = []
    for ch in value:
        if ch in counts:
            counts[ch] += 1
        balanced = all(counts[opener] == counts[closer] for opener, closer in bracket_pairs)
        if balanced and ch in separators:
            parts.append("".join(current).strip())
            current = []
        else:
            current.append(ch)
    if current:
        parts.append("".join(current).strip())

    # First fragment that mentions a chapter or a clause.
    target = next((part for part in parts if '章' in part or '款' in part), None)
    if target:
        # Drop the leading verb so only the reference itself remains.
        return re.sub(r'^(符合|应满足|详见)\s*', '', target.strip())

    return value
|
|
|
|
|
|
2024-10-30 18:08:46 +08:00
|
|
|
|
#[{'资格性审查.资格要求': '符合本采购文件第一章第二款要求,并提供合格有效的证明材料。'}]
|
2024-09-26 18:06:23 +08:00
|
|
|
|
def generate_questions(input_list):
    """Build one LLM question per referenced entry.

    Args:
        input_list (list[dict]): Dicts mapping a dotted key path to the text
            containing the reference, e.g.
            ``[{'资格性审查.资格要求': '符合本采购文件第一章第二款要求,并提供合格有效的证明材料。'}]``.

    Returns:
        list[str]: Questions in input order. In the human-readable part of
        the question the first '.' of the key is replaced by '中的'; the
        original key is quoted verbatim as the JSON key name to answer with,
        and the value is first reduced by ``preprocess_value``.
    """
    template = (
        "关于{modified_key},{value}的内容是怎样的?请按json格式给我提供信息,"
        "键名为'{original_key}',而键值需要完全与原文保持一致,不要擅自总结、删减,"
        "如果存在未知信息,请在对应键值处填'未知'。"
    )

    questions = []
    for entry in input_list:
        for original_key, value in entry.items():
            # Only the first dot is rewritten; keys without a dot stay as they are.
            readable_key = original_key.replace('.', '中的', 1) if '.' in original_key else original_key
            reference = preprocess_value(value)
            questions.append(
                template.format(modified_key=readable_key, original_key=original_key, value=reference)
            )
    return questions
|
|
|
|
|
|
2024-10-12 18:01:59 +08:00
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
eg:
|
|
|
|
|
response_list = [
|
|
|
|
|
{
|
|
|
|
|
"person.name": "Bob",
|
|
|
|
|
"person.address.city": "Los Angeles"
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"company.location": "Austin",
|
|
|
|
|
"person.age": 35
|
|
|
|
|
}
|
|
|
|
|
]
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 用新数据更新原有字典
|
|
|
|
|
def update_json_data(original_data, response_list):
    """Write the values of *response_list* into *original_data* in place.

    Each key is a dotted path ("a.b.c"). Missing intermediate dicts are
    created. When both the new value and the existing value are dicts they
    are shallow-merged (new entries win); otherwise the value is replaced.

    Path segments of the form ``name[index]`` — the shape produced by
    ``find_chapter_clause_references`` for dicts inside lists — address the
    element ``index`` of the existing list ``name``. If no such list element
    exists, or a key literally named ``name[index]`` is present, the segment
    is treated as a plain key as before.

    Args:
        original_data (dict): Structure to update (mutated).
        response_list (list[dict]): Dicts of ``{dotted_path: value}``.

    Returns:
        dict: *original_data* itself.

    Raises:
        AttributeError: If a path runs through an existing value that is not
            a dict (unchanged from the previous behaviour).
    """
    indexed_segment = re.compile(r'^(.+?)((?:\[\d+\])+)$')

    def locate_list_slot(container, segment):
        # Return (list, index) when `segment` addresses an existing list
        # element of `container`, else None.
        if not isinstance(container, dict) or segment in container:
            return None
        parsed = indexed_segment.match(segment)
        if parsed is None:
            return None
        node = container.get(parsed.group(1))
        holder = None
        position = None
        for raw_index in re.findall(r'\d+', parsed.group(2)):
            position = int(raw_index)
            if not isinstance(node, list) or position >= len(node):
                return None
            holder, node = node, node[position]
        return holder, position

    def recursive_update(data, key, value):
        # Walk the dotted path down to the parent of the final segment.
        keys = key.split('.')
        for k in keys[:-1]:
            slot = locate_list_slot(data, k)
            if slot is None:
                data = data.setdefault(k, {})
            else:
                data = slot[0][slot[1]]

        last = keys[-1]
        slot = locate_list_slot(data, last)
        if slot is None:
            holder, name = data, last
            existing = data.get(last)
        else:
            holder, name = slot
            existing = holder[name]

        if isinstance(value, dict) and isinstance(existing, dict):
            holder[name] = {**existing, **value}
        else:
            holder[name] = value

    for response_dict in response_list:
        for key, value in response_dict.items():
            recursive_update(original_data, key, value)
    return original_data
|
2024-09-26 14:08:34 +08:00
|
|
|
|
|
2024-10-12 18:01:59 +08:00
|
|
|
|
|
|
|
|
|
def process_match_keys(match_keys, clause_path_file):
    """Append referenced clause texts to the values in *match_keys*.

    For every value, the Arabic numbers 1-10 are rewritten as Chinese
    numerals; then each Chinese numeral that is not directly followed by
    "章" or "部分" is looked up in the keys of the clause JSON file. The
    values of all clause keys containing that numeral are appended (once
    each) to the entry, separated by newlines. Entries are modified in place
    and only when at least one clause was found; a modified entry stores the
    numeral-converted text.

    Args:
        match_keys (list[dict]): Key/value pairs whose values hold references.
        clause_path_file (str): Path of the JSON file with clause texts.

    Returns:
        list[dict]: *match_keys* (the same list object). It is returned
        untouched when the file is missing or is not valid JSON.
    """
    # Arabic -> Chinese numerals, up to ten.
    digit_map = {'1': '一', '2': '二', '3': '三', '4': '四', '5': '五', '6': '六', '7': '七', '8': '八', '9': '九',
                 '10': '十'}
    chinese_numerals = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十']

    # "10" must be tried before the single digits; replacing '1' first would
    # turn "10" into "一0" and the '十' mapping would never apply.
    digit_pattern = re.compile(r'10|[1-9]')

    # A run of Chinese numerals not followed by '章' or '部分', plus the next character.
    pattern = re.compile(r'([一二三四五六七八九十]+)(?!章|部分)(.)')

    # Load the clause texts.
    try:
        with open(clause_path_file, 'r', encoding='utf-8') as file:
            clause_path = json.load(file)
    except FileNotFoundError:
        print(f"文件未找到: {clause_path_file}")
        return match_keys
    except json.JSONDecodeError:
        print(f"文件内容不是有效的JSON格式: {clause_path_file}")
        return match_keys

    for item in match_keys:
        # Assigning to an existing key while iterating keeps the dict size
        # unchanged, so the iteration stays valid.
        for key, value in item.items():
            value = digit_pattern.sub(lambda found: digit_map[found.group()], value)

            # Clause texts to append, without duplicates and in lookup order.
            clauses_to_append = []
            for numeral, _following in pattern.findall(value):
                if numeral not in chinese_numerals:
                    continue
                for clause_key, clause_value in clause_path.items():
                    if numeral in clause_key and clause_value not in clauses_to_append:
                        clauses_to_append.append(clause_value)

            if clauses_to_append:
                item[key] = value + '\n' + '\n'.join(clauses_to_append)

    return match_keys
|
|
|
|
|
|
|
|
|
|
|
2024-10-30 18:08:46 +08:00
|
|
|
|
# 处理如'符合本采购文件第一章第二款要求'的情况,跳转到指定地方摘取内容
|
|
|
|
|
def process_additional_queries(combined_res, match_keys, output_folder, notice_path, invalid_path):
    """Resolve chapter/clause references and merge the answers into the result.

    The clause JSON produced by ``convert_clause_to_json`` for *notice_path*
    is consulted first. If that changes none of the entries, questions are
    generated for the entries and sent to the model together with the file
    uploaded from *invalid_path*.

    Args:
        combined_res: Result structure to update (passed to ``update_json_data``).
        match_keys: Reference entries, e.g.
            [{'资格性审查.资格要求': '符合本采购文件第一章第二款要求,并提供合格有效的证明材料。'}]
        output_folder: Output folder handed to ``convert_clause_to_json``.
        notice_path: Path of the notice file. The original comment states an
            empty path is handled by the callee; that code is not visible here.
        invalid_path: File uploaded for the model fallback.

    Returns:
        dict: ``{"资格审查": <updated combined_res>}``.
    """
    clause2_path = convert_clause_to_json(notice_path, output_folder, 2)

    # Work on a deep copy so the untouched original can serve as the baseline.
    resolved_keys = process_match_keys(copy.deepcopy(match_keys), clause2_path)

    if resolved_keys == match_keys:
        # Nothing was found in the notice: ask the model instead.
        ques = generate_questions(match_keys)
        file_id = upload_file(invalid_path)
        # Per the original comment: 1 = Bailian RAG, 2 = qianwen-long.
        qianwen_results = multi_threading(ques, "", file_id, 2)
        resolved_keys = [clean_json_string(res) for _, res in qianwen_results] if qianwen_results else []

    form_response_dict = update_json_data(combined_res, resolved_keys)
    return {"资格审查": form_response_dict}
|
2024-10-12 18:01:59 +08:00
|
|
|
|
|
2024-10-30 18:08:46 +08:00
|
|
|
|
|
2024-11-07 15:46:24 +08:00
|
|
|
|
def combine_qualification_review(invalid_path, qualification_path, notice_path):
    """Collect the qualification and conformity review criteria via the model.

    Flow, as implemented below:
      1. If *qualification_path* is given, upload it and ask whether it
         contains qualification review and conformity review criteria. For
         each part answered "是" the qualification file is queried; for the
         others the file uploaded from *invalid_path* is used instead.
         Without *qualification_path*, the *invalid_path* upload serves both.
      2. The two detail queries run in a thread pool. A third task extracting
         the applicant qualification requirements from *notice_path* is added
         only when *qualification_path* and *notice_path* are set and the
         first answer for "资格性审查" was "是".
      3. Results are reordered (applicant requirements, qualification review,
         conformity review) and passed through ``preprocess_dict`` and
         ``process_dict``.

    Args:
        invalid_path: Fallback file path. NOTE(review): in the ``__main__``
            example this is the complete tender document; confirm.
        qualification_path: Path of the qualification-review excerpt; may be empty.
        notice_path: Path of the notice excerpt; may be empty.

    Returns:
        dict: ``{"资格审查": processed_data}``. A failed task contributes the
        string "查询失败" or "处理失败" instead of raising.
    """
    detailed_res = {}

    # File id of the uploaded invalid_path file; uploaded lazily below.
    invalid_file_id = None
    first_res = {}
    if qualification_path:
        # Upload the qualification file and keep its file id.
        qualification_file_id = upload_file(qualification_path)

        # First query: do both review sections exist in the file?
        first_query = """
该文档中是否有关于资格性审查标准的具体内容,是否有关于符合性审查标准的具体内容?请以json格式给出回答,外键分别为'资格性审查'和'符合性审查',键值仅限于'是','否',输出格式示例如下:
{
"资格性审查":"是",
"符合性审查":"是"
}
"""

        # Run the first query and clean the returned JSON string.
        print("call first_query")
        first_res = clean_json_string(qianwen_long(qualification_file_id, first_query))

        # Use the qualification file only for the sections the model confirmed.
        zige_file_id = qualification_file_id if first_res.get("资格性审查") == "是" else None
        fuhe_file_id = qualification_file_id if first_res.get("符合性审查") == "是" else None

        # Upload the fallback file if either section is missing.
        if zige_file_id is None or fuhe_file_id is None:
            if invalid_file_id is None:
                invalid_file_id = upload_file(invalid_path)
            if zige_file_id is None:
                zige_file_id = invalid_file_id
            if fuhe_file_id is None:
                fuhe_file_id = invalid_file_id

    else:
        # No qualification_path: use the invalid_path upload for both queries.
        zige_file_id = fuhe_file_id = upload_file(invalid_path)

    # Second group of queries: the detailed criteria for each review type.
    second_query = [
        {
            "key": "资格性审查",
            "query": "该招标文件中规定的资格性审查标准是怎样的?请以json格式给出,外层为'资格性审查',你的回答要与原文完全一致,不可擅自总结删减,也不要回答有关符合性审查的内容。"
        },
        {
            "key": "符合性审查",
            "query": "该招标文件中规定的符合性审查标准是怎样的?请以json格式给出,外层为'符合性审查',你的回答要与原文完全一致,不可擅自总结删减,也不要回答有关资格性审查的内容。"
        }
    ]

    # Worker: run one detail query and return (key, content).
    def process_second_query(key, query, file_id):
        print("call second_query")
        try:
            res = qianwen_long(file_id, query)
            cleaned_res = clean_json_string(res)
            return key, cleaned_res.get(key, "未找到相关内容")
        except Exception as e:
            # Best effort: a failure is reported in the result, not raised.
            print(f"执行查询 '{key}' 时出错: {e}")
            return key, "查询失败"

    # Worker: extract the applicant qualification requirements from the notice.
    def process_notice(notice_path):
        print("call notice_path")
        try:
            # Upload the notice file and keep its file id.
            file_id1 = upload_file(notice_path)

            # Query asking for the applicant qualification requirements.
            user_query1 = """
该文档中说明的申请人资格要求是怎样的?请以json格式给出回答,外键为'申请人资格要求',键值为字符串列表,其中每个字符串对应原文中的一条要求,你的回答与原文内容一致,不要擅自总结删减。输出格式示例如下:
{
"申请人资格要求":[
"1.满足《中华人民共和国政府采购法》第二十二条规定;",
"1.1 法人或者其他组织的营业执照等证明文件,如供应商是自然人的提供身份证明材料;",
"2.未被列入“信用中国”网站(www.creditchina.gov.cn)信用服务栏失信被执行人、重大税收违法案件当事人名单;"
]
}
"""

            # Run the query and clean the result.
            res1 = clean_json_string(qianwen_long(file_id1, user_query1))

            # Pull out the requirements, with a placeholder when absent.
            requirements = res1.get("申请人资格要求", "未找到相关内容")
            return "申请人资格要求", requirements
        except Exception as e:
            # Best effort: a failure is reported in the result, not raised.
            print(f"处理申请人资格要求时出错: {e}")
            return "申请人资格要求", "处理失败"

    # Run up to three tasks concurrently.
    with ThreadPoolExecutor(max_workers=3) as executor:
        future_to_key = {}

        # Submit the two detail queries, each against its chosen file.
        for query_info in second_query:
            key = query_info["key"]
            query = query_info["query"]
            current_file_id = zige_file_id if key == "资格性审查" else fuhe_file_id
            future = executor.submit(process_second_query, key, query, current_file_id)
            future_to_key[future] = key

        # Submit the notice task only under the conditions described above.
        if qualification_path and notice_path and first_res.get("资格性审查") == "是":
            future = executor.submit(process_notice, notice_path)
            future_to_key[future] = "申请人资格要求"

        # Collect results in completion order; each worker returns its own key.
        for future in as_completed(future_to_key):
            key, result = future.result()
            detailed_res[key] = result

    # Desired key order of the output.
    desired_order = ["申请人资格要求", "资格性审查", "符合性审查"]
    # print(json.dumps(detailed_res,ensure_ascii=False,indent=4))
    # Rebuild the dict in that order (completion order above is arbitrary).
    ordered_res = {}
    for key in desired_order:
        if key in detailed_res:
            ordered_res[key] = detailed_res[key]

    # Clean and flatten the reordered result.
    processed_data = process_dict(preprocess_dict(ordered_res))

    # Wrap the final result under the top-level key.
    return {"资格审查": processed_data}
|
2024-11-07 15:46:24 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# def combine_qualification_review(invalid_path, output_folder, qualification_path, notice_path):
|
|
|
|
|
# DEFAULT_QUALIFICATION_REVIEW = {
|
|
|
|
|
# "资格审查": {
|
|
|
|
|
# "资格审查": "",
|
|
|
|
|
# "符合性审查": ""
|
|
|
|
|
# }
|
|
|
|
|
# }
|
|
|
|
|
#
|
|
|
|
|
# def process_file(file_path, invalid_path):
|
|
|
|
|
# file_id = upload_file(file_path)
|
|
|
|
|
# first_query = """
|
|
|
|
|
# 该文档中是否有关于资格性审查标准的具体内容,是否有关于符合性审查标准的具体内容?请以json格式给出回答,外键分别为'资格性审查'和'符合性审查',键值仅限于'是','否',输出格式示例如下:
|
|
|
|
|
# {
|
|
|
|
|
# "资格性审查":"是",
|
|
|
|
|
# "符合性审查":"是"
|
|
|
|
|
# }
|
|
|
|
|
# """
|
|
|
|
|
# qianwen_ans = clean_json_string(qianwen_long(file_id, first_query))
|
|
|
|
|
# user_queries = [
|
|
|
|
|
# {
|
|
|
|
|
# "key": "资格性审查",
|
|
|
|
|
# "query": "该招标文件中规定的资格性审查标准是怎样的?请以json格式给出,外层为'资格性审查',你的回答要与原文完全一致,不可擅自总结删减,也不要回答有关符合性性审查的内容。"
|
|
|
|
|
# },
|
|
|
|
|
# {
|
|
|
|
|
# "key": "符合性审查",
|
|
|
|
|
# "query": "该招标文件中规定的符合性审查标准是怎样的?请以json格式给出,外层为'符合性审查',你的回答要与原文完全一致,不可擅自总结删减,也不要回答有关资格性审查的内容。"
|
|
|
|
|
# }
|
|
|
|
|
# ]
|
|
|
|
|
# combined_res = {}
|
|
|
|
|
# file_id2 = None # 延迟上传 invalid_path
|
|
|
|
|
# def process_single_query(query_info):
|
|
|
|
|
# nonlocal file_id2
|
|
|
|
|
# key = query_info["key"]
|
|
|
|
|
# query = query_info["query"]
|
|
|
|
|
# # 根据键值决定使用哪个 file_id
|
|
|
|
|
# if qianwen_ans.get(key) == "否":
|
|
|
|
|
# print("no")
|
|
|
|
|
# if not file_id2:
|
|
|
|
|
# file_id2 = upload_file(invalid_path)
|
|
|
|
|
# current_file_id = file_id2
|
|
|
|
|
# else:
|
|
|
|
|
# current_file_id = file_id
|
|
|
|
|
#
|
|
|
|
|
# # 调用大模型获取回答
|
|
|
|
|
# ans = qianwen_long(current_file_id, query)
|
|
|
|
|
# cleaned_data = clean_json_string(ans)
|
|
|
|
|
# processed = process_dict(preprocess_dict(cleaned_data))
|
|
|
|
|
# return processed
|
|
|
|
|
#
|
|
|
|
|
# # 使用线程池并行处理查询
|
|
|
|
|
# with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
|
|
|
|
|
# futures = [executor.submit(process_single_query, q) for q in user_queries]
|
|
|
|
|
# for future in concurrent.futures.as_completed(futures):
|
|
|
|
|
# result = future.result()
|
|
|
|
|
# combined_res.update(result)
|
|
|
|
|
# return combined_res
|
|
|
|
|
#
|
|
|
|
|
# try:
|
|
|
|
|
# if not qualification_path:
|
|
|
|
|
# file_to_process = invalid_path
|
|
|
|
|
# else:
|
|
|
|
|
# file_to_process = qualification_path
|
|
|
|
|
#
|
|
|
|
|
# combined_res = process_file(file_to_process,invalid_path)
|
|
|
|
|
# match_keys = find_chapter_clause_references(combined_res)
|
|
|
|
|
#
|
|
|
|
|
# if not match_keys:
|
|
|
|
|
# return {"资格审查": combined_res}
|
|
|
|
|
#
|
|
|
|
|
# return process_additional_queries(combined_res, match_keys, output_folder, notice_path,invalid_path) #还要跳转到第一章
|
|
|
|
|
#
|
|
|
|
|
# except Exception as e:
|
|
|
|
|
# print(f"Error in combine_qualification_review: {e}")
|
|
|
|
|
# return DEFAULT_QUALIFICATION_REVIEW.copy()
|
2024-10-12 18:01:59 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 整合基础信息核心代码
|
|
|
|
|
# [{'资格性审查.资格要求': '符合本采购文件第一章第二款要求,并提供合格有效的证明材料'}, {'资格性审查.没有重大违法记录的书面声明': '是否提交参加政府采购活动前三年内在经营活动中没有重大违法记录的书面承诺或声明(格式要求详见本项目采购文件第六章相关格式要求)'}]
|
2024-09-23 15:49:30 +08:00
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run against local sample files; prints the result and the elapsed time.
    start_time = time.time()

    # Alternative sample inputs, kept for reference:
    # qualification_path="C:\\Users\\Administrator\\Desktop\\货物标\\output3\\6.2定版视频会议磋商文件_qualification2.pdf"
    # output_folder = "D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89"
    # qualification_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\52e54b20-c975-4cf3-a06b-6f146aaa93f5\\ztbfile_qualification1.pdf"
    # qualification_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\6558a50a-13ea-4279-a5db-684935481c39\\ztbfile_qualification2.pdf"
    # notice_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\6558a50a-13ea-4279-a5db-684935481c39\\ztbfile_notice.pdf"
    # notice_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\52e54b20-c975-4cf3-a06b-6f146aaa93f5\\ztbfile_notice.pdf"
    # knowledge_name = "6.2视频会议docx"
    # invalid_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile.pdf"
    # invalid_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\52e54b20-c975-4cf3-a06b-6f146aaa93f5\\ztbfile.pdf"

    # output_folder is assigned but not used by the call below.
    output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\52e54b20-c975-4cf3-a06b-6f146aaa93f5"
    qualification_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output3\\6.2定版视频会议磋商文件_qualification2.pdf"
    notice_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output5\\6.2定版视频会议磋商文件_notice.pdf"
    invalid_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\6.2定版视频会议磋商文件.pdf"

    res = combine_qualification_review(invalid_path, qualification_path, notice_path)
    print(json.dumps(res, ensure_ascii=False, indent=4))

    end_time = time.time()
    print(f"耗时:{end_time - start_time}")
|