9.26 分段解析完全版

This commit is contained in:
zy123 2024-09-26 18:06:23 +08:00
parent b653d97c54
commit e6cedef2c2
7 changed files with 119 additions and 138 deletions

View File

@ -1,124 +1,19 @@
import concurrent.futures
import json
import time
import sys
def fetch_project_basic_info(knowledge_name, truncate0, output_folder, clause_path, delay=2):
    """Simulate the "project basic info" task.

    Stand-in used to exercise the streaming pipeline: it only waits and
    returns a fixed payload.

    Args:
        knowledge_name: Unused by this stub; kept for call compatibility.
        truncate0: Unused by this stub; kept for call compatibility.
        output_folder: Unused by this stub; kept for call compatibility.
        clause_path: Unused by this stub; kept for call compatibility.
        delay: Seconds to sleep to simulate work. Defaults to 2, the value
            that was previously hard-coded.

    Returns:
        dict: ``{"basic_info": "Project basic info"}``.
    """
    time.sleep(delay)  # simulated task latency
    return {"basic_info": "Project basic info"}
def fetch_qualification_review(truncate1, truncate3, knowledge_name, truncate0_jsonpath, clause_path,
                               input_file_path, output_folder, delay=4):
    """Simulate the "qualification review" task.

    Stand-in used to exercise the streaming pipeline: it only waits and
    returns a fixed payload. All positional arguments are accepted for call
    compatibility and are not used by this stub.

    Args:
        delay: Seconds to sleep to simulate work. Defaults to 4, the value
            that was previously hard-coded.

    Returns:
        dict: ``{"qualification_review": "Qualification review"}``.
    """
    time.sleep(delay)  # simulated task latency
    return {"qualification_review": "Qualification review"}
def fetch_evaluation_standards(truncate1, delay=6):
    """Simulate the "evaluation standards" task.

    Stand-in used to exercise the streaming pipeline: it only waits and
    returns a fixed payload holding both the technical and the commercial
    standards.

    Args:
        truncate1: Unused by this stub; kept for call compatibility.
        delay: Seconds to sleep to simulate work. Defaults to 6, the value
            that was previously hard-coded.

    Returns:
        dict: A mapping with the keys ``"technical_standards"`` and
        ``"commercial_standards"``.
    """
    time.sleep(delay)  # simulated task latency
    return {"technical_standards": "Technical standards", "commercial_standards": "Commercial standards"}
def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3, delay=8):
    """Simulate the "invalid requirements" task.

    Stand-in used to exercise the streaming pipeline: it only waits and
    returns a fixed payload. All positional arguments are accepted for call
    compatibility and are not used by this stub.

    Args:
        delay: Seconds to sleep to simulate work. Defaults to 8, the value
            that was previously hard-coded.

    Returns:
        dict: ``{"invalid_requirements": "Invalid requirements"}``.
    """
    time.sleep(delay)  # simulated task latency
    return {"invalid_requirements": "Invalid requirements"}
def fetch_bidding_documents_requirements(clause_path, delay=10):
    """Simulate the "bidding documents requirements" task.

    Stand-in used to exercise the streaming pipeline: it only waits and
    returns a fixed payload.

    Args:
        clause_path: Unused by this stub; kept for call compatibility.
        delay: Seconds to sleep to simulate work. Defaults to 10, the value
            that was previously hard-coded.

    Returns:
        dict: ``{"bidding_documents_requirements": "Bidding documents requirements"}``.
    """
    time.sleep(delay)  # simulated task latency
    return {"bidding_documents_requirements": "Bidding documents requirements"}
def fetch_bid_opening(clause_path, delay=12):
    """Simulate the "bid opening" task.

    Stand-in used to exercise the streaming pipeline: it only waits and
    returns a fixed payload.

    Args:
        clause_path: Unused by this stub; kept for call compatibility.
        delay: Seconds to sleep to simulate work. Defaults to 12, the value
            that was previously hard-coded.

    Returns:
        dict: ``{"opening_bid": "Opening bid"}``.
    """
    time.sleep(delay)  # simulated task latency
    return {"opening_bid": "Opening bid"}
def transform_json_values(data):
    """Return ``data`` as-is.

    Placeholder for a value transformation step; this version performs no
    conversion and hands back the very same object it received.
    """
    transformed = data
    return transformed
def get_global_logger(unique_id):
    """Return the ``logging`` logger registered under the name ``unique_id``."""
    # Imported locally, as in the original, so the helper carries its own dependency.
    from logging import getLogger
    return getLogger(unique_id)
def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
    """Return simulated pre-processing output.

    The mapping holds a knowledge-base name derived from ``unique_id``, a set
    of placeholder entries whose value equals their key, and the
    ``output_folder`` passed in. ``downloaded_file_path`` and ``file_type``
    are not used by this stub.
    """
    placeholder_names = (
        'truncate0',
        'truncate1',
        'truncate3',
        'truncate0_jsonpath',
        'clause_path',
        'invalid_docpath',
        'input_file_path',
    )
    processed = {'knowledge_name': 'KnowledgeName_' + unique_id}
    # Each placeholder maps to its own name, mirroring the original literal.
    for name in placeholder_names:
        processed[name] = name
    processed['output_folder'] = output_folder
    return processed
def deleteKnowledge(knowledge_index):
    """Simulate deleting a knowledge index; does nothing and returns ``None``."""
    return None
def main_process(output_folder, downloaded_file_path, file_type, unique_id):
    """Run all extraction tasks concurrently and stream their results as JSON.

    The input is pre-processed once, then six tasks are submitted to a thread
    pool. Results are yielded in completion order, each as a JSON string of
    the form ``{task_key: result}``. The ``evaluation_standards`` result is
    split into two separate JSON strings (technical and commercial). A task
    that raises is reported as ``{"error": "..."}`` and does not stop the
    remaining tasks. After all tasks finish, the knowledge index named in the
    pre-processing output is deleted.

    Args:
        output_folder: Folder passed to pre-processing and to the tasks.
        downloaded_file_path: Forwarded to ``preprocess_files``.
        file_type: Forwarded to ``preprocess_files``.
        unique_id: Used as the logger name and forwarded to ``preprocess_files``.

    Yields:
        str: One JSON document per finished (sub-)result. A single ``"{}"``
        is yielded, and nothing else, when pre-processing returns nothing.
    """
    global logger
    logger = get_global_logger(unique_id)

    # Pre-process the file and collect the data every task needs.
    processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id)
    if not processed_data:
        yield json.dumps({})
        # Stop here: without this return the code below would index into the
        # empty pre-processing result and fail.
        return

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = {
            'base_info': executor.submit(
                fetch_project_basic_info,
                processed_data['knowledge_name'],
                processed_data['truncate0'],
                output_folder,
                processed_data['clause_path'],
            ),
            'qualification_review': executor.submit(
                fetch_qualification_review,
                processed_data['truncate1'],
                processed_data['truncate3'],
                processed_data['knowledge_name'],
                processed_data['truncate0_jsonpath'],
                processed_data['clause_path'],
                processed_data['input_file_path'],
                processed_data['output_folder'],
            ),
            'evaluation_standards': executor.submit(
                fetch_evaluation_standards,
                processed_data['truncate1'],
            ),
            'invalid_requirements': executor.submit(
                fetch_invalid_requirements,
                processed_data['invalid_docpath'],
                output_folder,
                processed_data['truncate0_jsonpath'],
                processed_data['clause_path'],
                processed_data['truncate3'],
            ),
            'bidding_documents_requirements': executor.submit(
                fetch_bidding_documents_requirements,
                processed_data['clause_path'],
            ),
            'opening_bid': executor.submit(
                fetch_bid_opening,
                processed_data['clause_path'],
            ),
        }
        # Reverse map so each finished future resolves to its key in O(1).
        key_by_future = {future: key for key, future in futures.items()}

        # Emit results in the order the tasks complete.
        for future in concurrent.futures.as_completed(key_by_future):
            key = key_by_future[future]
            try:
                result = future.result()
                logger.info(f"Task {key} completed.")
                if key == 'evaluation_standards':
                    # Technical and commercial standards are sent as two
                    # separate messages.
                    technical_standards = result["technical_standards"]
                    commercial_standards = result["commercial_standards"]
                    modified_technical_result = transform_json_values(
                        {"technical_standards": technical_standards})
                    modified_commercial_result = transform_json_values(
                        {"commercial_standards": commercial_standards})
                    yield json.dumps(modified_technical_result, ensure_ascii=False)
                    yield json.dumps(modified_commercial_result, ensure_ascii=False)
                    sys.stdout.flush()  # force-flush streamed output
                else:
                    modified_result = transform_json_values({key: result})
                    yield json.dumps(modified_result, ensure_ascii=False)
                    sys.stdout.flush()  # force-flush streamed output
            except Exception as exc:
                # Report the failure to the consumer and keep streaming the
                # remaining tasks.
                logger.error(f"Error processing {key}: {exc}")
                yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)
                sys.stdout.flush()

    # Remove the knowledge index once every task has finished.
    deleteKnowledge(processed_data['knowledge_name'])
# 调用主函数,模拟执行
from flask_app.main.json_utils import extract_content_from_json
from flask_app.main.多线程提问 import multi_threading
from flask_app.货物标.资格审查main import update_json_data
# Ad-hoc experiment: ask the knowledge base one hand-written question and
# collect the JSON content extracted from each answer.
ques = [
    "关于'资格要求',本采购文件第一章第二款要求的内容是怎样的请按json格式给我提供信息键名为'资格性审查.资格要求',你不能回答而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'"
]
knowledge_name = "6.2视频会议docx"
results = multi_threading(ques, knowledge_name)  # un-numbered entries are asked directly
first_response_list = []
for _, response in results:
    try:
        # A usable response must exist and hold at least two elements.
        if not response or len(response) <= 1:
            print(f"形式响应评审Warning: Missing or incomplete response data for query index {_}.")
            continue
        temp = extract_content_from_json(response[1])
        first_response_list.append(temp)
    except Exception as e:
        print(f"形式响应评审Error processing response for query index {_}: {e}")
print(first_response_list)

View File

@ -137,7 +137,9 @@ def reformat_questions(match_keys, input_path, output_folder):
# 初始化文件处理
generated_files = None
file_processed = False
question_template = (
"关于'{key}'{revised_value}的内容是怎样的请按json格式给我提供信息键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'"
)
for entry in match_keys:
key, value = next(iter(entry.items()))
match = pattern.search(value)
@ -156,14 +158,14 @@ def reformat_questions(match_keys, input_path, output_folder):
entries_with_numbers2.append({key: num})
else:
revised_value = value.replace('符合', '')
formatted_entry = f"关于'{key}'{revised_value}的内容是怎样的请按json格式给我提供信息键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'"
formatted_entry = question_template.format(key=key, revised_value=revised_value)
formatted_questions.append(formatted_entry)
elif match:
num = match.group(1) + (f"({match.group(2)})" if match.group(2) else "")
entries_with_numbers.append({key: num})
else:
revised_value = value.replace('符合', '')
formatted_entry = f"关于'{key}'{revised_value}的内容是怎样的请按json格式给我提供信息键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'"
formatted_entry = question_template.format(key=key, revised_value=revised_value)
formatted_questions.append(formatted_entry)
return entries_with_numbers, formatted_questions, entries_with_numbers2, clause2_path

View File

@ -265,7 +265,7 @@ def truncate_pdf_multiple(input_path, output_folder):
if __name__ == "__main__":
# input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74\\ztbfile.pdf"
# output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4bda9fde-89fc-4e5e-94a4-ce6c43010f74"
input_path="D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21\\ztbfile.pdf"
input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest17.pdf"
output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp"
truncate_pdf_multiple(input_path,output_folder)
# selection = 3 # 例如1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标

View File

@ -292,9 +292,6 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
# 删除知识索引
deleteKnowledge(index)
#TODO:流式输出要改商务标技术标整合
if __name__ == "__main__":
output_folder = "flask_app/static/output/zytest1"

View File

@ -340,7 +340,7 @@ def main(base_directory, intermediate_directory, final_directory):
os.makedirs(final_directory, exist_ok=True)
process_directory(base_directory, intermediate_directory, final_directory)
#TODO后期可加多线程进行处理
if __name__ == "__main__":
base_directory = 'D:\\bidding_trading_files\\招投标文件2023\\2023'
intermediate_directory = 'D:\\tmp'

View File

@ -178,7 +178,6 @@ def find_forbidden(truncate_json_path,clause_path,truncate3): #投标人须
forbidden_dict={'不得存在的其他情形':merged_forbidden_list}
return forbidden_dict
#TODO:不得存在的情况文中有很多内容
if __name__ == '__main__':
truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\796f7bb3-2f7a-4332-b044-9d817a07861e\\truncate_output.json"

View File

@ -7,6 +7,7 @@ from flask_app.main.通义千问long import qianwen_long,upload_file
from flask_app.main.多线程提问 import multi_threading
from flask_app.main.json_utils import combine_json_results,clean_json_string
from flask_app.main.形式响应评审 import update_json_data,extract_matching_keys
from flask_app.main.json_utils import extract_content_from_json
#这个字典可能有嵌套,你需要遍历里面的键名,对键名作判断,而不是键值,具体是这样的:如果处于同一层级的键的数量>1并且键名全由数字或点号组成。那么就将这些序号键名全部删除重新组织成一个字典格式的数据你可以考虑用字符串列表来保持部分平级的数据
#对于同级的键,如果数量>1且键名都统一那么将键名去掉用列表保持它们的键值
#对于同一个字典中,可能存在若干键值对,若它们的键值都是""或者"/" 你就将它们的键值删去,它们的键名用字符串列表保存
@ -102,6 +103,7 @@ def process_dict(data):
def find_chapter_clause_references(data, parent_key=""):
exclude_list=["格式要求"]
result = []
# 正则匹配"第x章"或"第x款"
chapter_clause_pattern = re.compile(r'第[一二三四五六七八九十\d]+[章款]')
@ -111,6 +113,11 @@ def find_chapter_clause_references(data, parent_key=""):
# 生成当前的完整键名
full_key = f"{parent_key}.{key}" if parent_key else key
# 检查是否应排除该键或值
if any(exclude_item in full_key for exclude_item in exclude_list) or \
(isinstance(value, str) and any(exclude_item in value for exclude_item in exclude_list)):
continue # 如果在排除项中,跳过处理
if isinstance(value, dict):
# 如果值是字典,递归调用函数
result.extend(find_chapter_clause_references(value, full_key))
@ -121,25 +128,106 @@ def find_chapter_clause_references(data, parent_key=""):
return result
# def reformat_questions(match_keys):
def preprocess_value(value):
    """Extract the clause that names a chapter ("第X章") or item ("第X款").

    When ``value`` mentions a chapter or item, it is split on sentence
    punctuation (comma, full stop, question mark, exclamation mark, in both
    ASCII and full-width forms). Punctuation inside unbalanced brackets does
    not split. The first segment containing "章" or "款" is returned with a
    leading "符合" or "应满足" removed.

    Args:
        value: Text to analyse. Non-string values are returned unchanged.

    Returns:
        The extracted segment, or ``value`` itself when it mentions no
        chapter/item or no segment qualifies.
    """
    if not isinstance(value, str):
        return value

    # Same condition as searching for "第...章" or "第...款" separately.
    if not re.search(r'第(.+?)[章款]', value):
        return value

    # Full-width characters are written as escapes so they cannot be lost or
    # normalised away: \uff0c ",", \uff1f "?", \uff01 "!", \uff08 "(", \uff09 ")".
    separators = frozenset(",。?!\uff0c\uff1f\uff01")
    bracket_counts = {'(': 0, ')': 0, '\uff08': 0, '\uff09': 0}

    parts = []
    current_part = []
    for ch in value:
        if ch in bracket_counts:
            bracket_counts[ch] += 1
        # Punctuation only separates when every opened bracket has been closed.
        brackets_balanced = (
            bracket_counts['('] == bracket_counts[')']
            and bracket_counts['\uff08'] == bracket_counts['\uff09']
        )
        if brackets_balanced and ch in separators:
            parts.append("".join(current_part).strip())
            current_part = []
        else:
            current_part.append(ch)
    if current_part:
        parts.append("".join(current_part).strip())

    # Pick the segment that actually names the chapter or item.
    target_part = next((part for part in parts if '章' in part or '款' in part), None)
    if target_part:
        # Drop a leading "符合" / "应满足" so only the reference itself remains.
        return re.sub(r'^(符合|应满足)\s*', '', target_part.strip())

    # No qualifying segment: fall back to the original text.
    return value
def generate_questions(input_list):
    """Build one question string per key/value pair found in ``input_list``.

    ``input_list`` is an iterable of dicts. Each value is passed through
    ``preprocess_value`` and inserted, together with its key, into a fixed
    question template. Questions are returned in iteration order.
    """
    template = (
        "关于'{key}',{value}的内容是怎样的请按json格式给我提供信息键名为'{key}',而键值需要完全与原文保持一致,不要擅自总结、删减,如果存在未知信息,请在对应键值处填'未知'"
    )
    return [
        template.format(key=name, value=preprocess_value(raw_value))
        for entry in input_list
        for name, raw_value in entry.items()
    ]
def update_json_data(original_data, response_list):
    """Merge dotted-key responses into a nested dict, in place.

    Every key in a response dict is treated as a dot-separated path into
    ``original_data`` (``"a.b"`` targets ``original_data["a"]["b"]``).
    Missing intermediate levels are created. When both the existing leaf and
    the new value are dicts they are shallow-merged (new keys win); otherwise
    the leaf is overwritten.

    Args:
        original_data: Nested dict to update; it is mutated.
        response_list: Iterable of dicts mapping dotted paths to values.
            ``None`` is treated as empty, and items that are not dicts are
            skipped.

    Returns:
        dict: ``original_data`` (the same object) after the updates.
    """
    def recursive_update(data, key, value):
        # Walk (and create where needed) every level but the last.
        *parent_keys, leaf_key = key.split('.')
        for part in parent_keys:
            child = data.get(part)
            if not isinstance(child, dict):
                # A non-dict value in the middle of the path cannot hold the
                # update, so it is replaced by a fresh level instead of
                # raising AttributeError.
                child = {}
                data[part] = child
            data = child

        existing = data.get(leaf_key)
        if isinstance(value, dict) and isinstance(existing, dict):
            data[leaf_key] = {**existing, **value}
        else:
            data[leaf_key] = value

    for response_dict in response_list or []:
        if not isinstance(response_dict, dict):
            # Nothing to merge from a failed or malformed extraction.
            continue
        for key, value in response_dict.items():
            recursive_update(original_data, key, value)
    return original_data
def qualification_review(truncate_file, knowledge_name):
    """Extract the qualification and conformity review criteria of a tender file.

    Steps performed by this function:
      1. Upload ``truncate_file`` and ask two questions against it (one for
         "资格性审查", one for "符合性审查").
      2. Clean each answer and merge the results into one dict.
      3. Collect entries flagged by ``find_chapter_clause_references`` and
         turn them into follow-up questions.
      4. Ask those questions against ``knowledge_name`` and merge the
         extracted answers back into the combined dict.

    Args:
        truncate_file: Path of the file handed to ``upload_file``.
        knowledge_name: Knowledge-base name used for the follow-up questions.

    Returns:
        dict: The combined review data, updated with the follow-up answers.
    """
    file_id = upload_file(truncate_file)
    user_query = [
        "该招标文件中规定的资格性审查标准是怎样的请以json格式给出外层为'资格性审查',你的回答要与原文一致,不可擅自总结删减,也不要回答有关符合性性审查的内容。",
        "该招标文件中规定的符合性审查标准是怎样的请以json格式给出外层为'符合性审查',你的回答要与原文一致,不可擅自总结删减,也不要回答有关资格性审查的内容。",
    ]
    results = multi_threading(user_query, "", file_id, 2)

    combined_res = {}
    for question, response in results:
        print(response)
        cleaned_data = clean_json_string(response)
        processed1 = preprocess_dict(cleaned_data)
        processed2 = process_dict(processed1)
        combined_res.update(processed2)

    # Entries that only point to a chapter/item need a follow-up question.
    match_keys = find_chapter_clause_references(combined_res)
    ques = generate_questions(match_keys)
    if not ques:
        # Nothing to resolve, so the second round of questions is not needed.
        return combined_res

    results = multi_threading(ques, knowledge_name)  # un-numbered entries are asked directly
    first_response_list = []
    for question, response in results:
        try:
            # A usable response must exist and hold at least two elements.
            if response and len(response) > 1:
                first_response_list.append(extract_content_from_json(response[1]))
            else:
                print(f"形式响应评审Warning: Missing or incomplete response data for query index {question}.")
        except Exception as e:
            # Best effort: one bad answer must not discard the others.
            print(f"形式响应评审Error processing response for query index {question}: {e}")

    return update_json_data(combined_res, first_response_list)
#[{'资格性审查.资格要求': '符合本采购文件第一章第二款要求,并提供合格有效的证明材料'}, {'资格性审查.没有重大违法记录的书面声明': '是否提交参加政府采购活动前三年内在经营活动中没有重大违法记录的书面承诺或声明(格式要求详见本项目采购文件第六章相关格式要求)'}]
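#TODO: There is a serious problem: for {'资格性审查.资格要求': '符合本采购文件第一章第二款要求,并提供合格有效的证明材料'}, calling the Bailian RAG tends to return an identical answer instead of jumping to the referenced passage. Three possible approaches: 1. structure the content of chapter one; 2. optimise the prompt; 3. build the question without the value and ask with the key only.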
if __name__ == "__main__":
    # Manual run against a local sample file and knowledge base.
    truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output3\\6.2定版视频会议磋商文件_qualification2.pdf"
    knowledge_name = "6.2视频会议docx"
    # qualification_review requires both arguments; the earlier one-argument
    # call would raise TypeError and has been removed.
    res = qualification_review(truncate_file, knowledge_name)
    print(json.dumps(res, ensure_ascii=False, indent=4))