9.23测试分段

This commit is contained in:
zy123 2024-09-23 15:49:30 +08:00
parent 8789bb540a
commit 60f17e32bf
11 changed files with 230 additions and 40 deletions

View File

@ -261,7 +261,20 @@ def test_process_and_stream():
'data': data
}
yield f"data: {json.dumps(response, ensure_ascii=False)}\n\n"
time.sleep(5) # 每隔5秒发送一段数据
time.sleep(3) # 每隔5秒发送一段数据
# 在结束信号之前发送完整的数据
combined_data = {}
for segment in data_segments:
combined_data.update(segment) # 将所有段落数据合并成一个大字典
# 发送完整的大字典
complete_response = {
'message': 'Combined data',
'filename': filename,
'data': combined_data
}
yield f"data: {json.dumps(complete_response, ensure_ascii=False)}\n\n"
# 发送结束信号
final_response = {

View File

@ -107,7 +107,7 @@ def combine_evaluation_standards(truncate2):
update_json = combine_technical_and_business(clean_json_string(evaluation_res), target_values1)
evaluation_combined_res = json.dumps(update_json,ensure_ascii=False,indent=4)
return evaluation_combined_res
# return update_json #商务标技术标整合
if __name__ == "__main__":
truncate2="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest20_evaluation_method.pdf"
res=combine_evaluation_standards(truncate2)

View File

@ -110,6 +110,18 @@ def fetch_qualification_review(truncate1, truncate3, knowledge_name, truncate0_j
# 评分细则
# def fetch_evaluation_standards(truncate1): # 评标办法前附表
# logger.info("starting 商务标和技术标...")
#
# # 获取评标办法前附表的字典结果
# evaluation_standards_res = combine_evaluation_standards(truncate1)
# # 从结果中提取"商务标"和"技术标"
# technical_standards = evaluation_standards_res.get("技术标", {})
# commercial_standards = evaluation_standards_res.get("商务标", {})
# logger.info("商务标和技术标 done")
# # 返回技术标和商务标
# return technical_standards, commercial_standards
def fetch_evaluation_standards(truncate1): # 评标办法前附表
logger.info("starting商务标技术标...")
evaluation_standards_res = combine_evaluation_standards(truncate1)
@ -219,7 +231,9 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
# processed_data['knowledge_name'], processed_data['truncate0_jsonpath'],
# processed_data['clause_path'], processed_data['input_file_path'],
# processed_data['output_folder']),
# 'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate_files'][1]),
# # 'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate_files'][1]),
# 'technical_standards': executor.submit(lambda: fetch_evaluation_standards(processed_data['truncate_files'][1])[0]), # 技术标
# 'commercial_standards': executor.submit(lambda: fetch_evaluation_standards(processed_data['truncate_files'][1])[1]), # 商务标
# 'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
# output_folder, processed_data['truncate0_jsonpath'],
# processed_data['clause_path'], processed_data['truncate_files'][4]),
@ -252,6 +266,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):
# # 删除知识索引
# deleteKnowledge(processed_data['knowledge_index'])
#TODO:流式输出要改商务标技术标整合
if __name__ == "__main__":
output_folder = "flask_app/static/output/zytest1"

View File

@ -32,9 +32,9 @@ if __name__ == "__main__":
input_file="C:\\Users\\Administrator\\Desktop\\fsdownload\\4a611a66-0dac-4daf-b365-c6167c5b8c8d\\ztbfile.pdf"
output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\4a611a66-0dac-4daf-b365-c6167c5b8c8d"
# truncate0 = os.path.join(output_folder, "zbtest20_tobidders_notice_table.pdf")
truncate2=os.path.join(output_folder,"ztbfile_tobidders_notice.pdf")
truncate2=os.path.join(output_folder,"ztbfile_qualification.pdf")
knowledge_name="zbtest20"
truncate1=os.path.join(output_folder,"ztbfile_evaluation_method.pdf")
truncate1=os.path.join(output_folder,"ztbfile_qualification.pdf")
truncate3=os.path.join(output_folder,"ztbfile_qualification.pdf")
clause_path = convert_clause_to_json(truncate2, output_folder)
truncate0_jsonpath = os.path.join(output_folder, "truncate_output.json")

View File

@ -110,13 +110,17 @@ def process_qualification(qualification_review,truncate3,knowledge_name):
for question, response in results2:
cleaned_res = clean_json_string(response)
res_list.append(cleaned_res) # 都是问资格评审表得出的
if res_list:
merged_dict = merge_dictionaries_under_common_key(res_list, '资格评审')
consortium_dict = get_consortium_dict(knowledge_name)
updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict) # 合并字典
updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict)
return updated_qualify_json
else:
print("资格评审: 无法获取大模型结果,返回空值")
return {"资格评审": ""}
else:
print("资格评审: type2")
return get_all_dict(knowledge_name)
return get_all_dict(knowledge_name) or {"资格评审": ""}
else:
print("资格评审: type3")
@ -128,13 +132,13 @@ def process_qualification(qualification_review,truncate3,knowledge_name):
final_qualify_json = add_keys_to_json(new_non_matching_json, consortium_dict)
return final_qualify_json
else:
return new_non_matching_json
return new_non_matching_json or {"资格评审": ""}
elif matching_keys_list and truncate3=="": #这种情况是评分办法前附表中有要求,但是没有正确截取到'资格审查表'
print("资格评审: type4")
final_qualification=get_all_dict(knowledge_name)
final_qualify_json = add_keys_to_json(final_qualification, non_matching_dict)
return final_qualify_json
return final_qualify_json or {"资格评审": ""}
else: #大多数情况
print("资格评审: type5")
user_querys = generate_qual_question(matching_keys_list) # 生成提问->‘附件:资格审查’
@ -143,6 +147,7 @@ def process_qualification(qualification_review,truncate3,knowledge_name):
res_list = []
if not results2:
print("资格评审: 调用大模型未成功获取资格评审文件中的要求!")
return {"资格评审": ""}
else:
# 打印结果
for question, response in results2:
@ -152,7 +157,7 @@ def process_qualification(qualification_review,truncate3,knowledge_name):
consortium_dict=get_consortium_dict(knowledge_name)
updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict) # 合并字典
final_qualify_json = add_keys_to_json(updated_qualify_json, non_matching_dict)
return final_qualify_json
return final_qualify_json or {"资格评审": ""}
if __name__ == "__main__":
# qualification_review={'营业执照': '具备有效的营业执照', '资质等级': '具备建设行政主管部门颁发的市政公用工程监理乙级及以上资质或房屋建筑工程监理乙级及以上资质或工程监理综合资质证书', '财务状况': '投标人须提供近三年2018 年、2019 年、2020 年)完', '类似项目业绩': '投标人近 5 年2017 年至今须具有至少一项投资概算在4000 万元及以上房屋建筑工程或市政公用工程监理业绩,同时提供中标通知书及监理服务合同,项目规模及项目时间认定以监理服务合同内信息为准', '信誉': '根据《关于在招标投标活动中对失信被执行', '主要人员': '监理工程师:至少提供 1 名具有房屋建筑工程专','不存在禁止投标的':'不存在第二章“投标人须知”第 1.4.3 项规定的情形','联合体投标':'hha'}

View File

@ -1,3 +1,108 @@
from flask_app.main.json_utils import nest_json_under_key
res=nest_json_under_key("","资格")
print(res)
import json
import re
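# The dictionary may be nested. Walk its key names and decide based on the key names, not the values:
# if there is more than one key at the same level and every key name consists only of digits or dots,
# drop all of those ordinal key names and reorganise the data in dictionary form
# (a list of strings may be used to keep some of the same-level data).
# For keys at the same level: if there is more than one and the key names are all the same, drop the key names and keep their values in a list.
# Within one dictionary there may be several key/value pairs whose values are all "" or "/": drop those values and keep the key names as a list of strings.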
def is_numeric_key(key):
    """Return True when *key* looks like a list ordinal rather than a field name.

    Accepted shapes are: digits and dots ("1", "1.2"), a parenthesised number
    ("(3)"), a single ASCII letter ("a"), a letter followed by digits ("a1"),
    digits followed by a letter ("1a"), or a letter followed by a dot ("a.").
    Letter-digit-letter combinations such as "a1b" are rejected.

    Non-string keys are never treated as ordinals; they return False instead
    of raising TypeError inside the regex engine.
    """
    if not isinstance(key, str):
        return False
    # fullmatch anchors both ends strictly; "$" with re.match would also
    # accept a key that ends in a newline (e.g. "1\n").
    pattern = r'[\d.]+|\(\d+\)|[a-zA-Z]|[a-zA-Z]\d+|\d+[a-zA-Z]|[a-zA-Z]\.'
    return re.fullmatch(pattern, key) is not None
#TODO:如果键值中存在数字就不行
#zbtest20也有问题
def contains_number_or_index(key, value):
    """Decide whether a key/value pair is a bare number or a serial-number field.

    Rules, in order:
    * a string value that contains both a digit and a Chinese character is
      always kept (returns False), even when the key contains "序号";
    * otherwise the pair is flagged (returns True) when the key contains
      "序号", when the value is an int/float, or when the value is a string
      that contains a digit.

    Always returns a real ``bool``. The previous implementation could leak a
    ``re.Match`` object or ``None`` to the caller. Non-string keys are treated
    as not containing "序号" instead of raising TypeError.
    """
    key_has_index = isinstance(key, str) and '序号' in key

    if isinstance(value, str):
        has_digit = re.search(r'\d', value) is not None
        has_chinese = re.search(r'[\u4e00-\u9fff]', value) is not None
        # Digits embedded in Chinese text are real content, not an ordinal.
        if has_digit and has_chinese:
            return False
        # str.isdigit() is kept because it also accepts digit characters
        # that the regex class \d does not match.
        return key_has_index or has_digit or value.isdigit()

    # NOTE(review): bool is a subclass of int, so True/False values are
    # reported as numbers here, exactly as before. Confirm that is intended.
    return key_has_index or isinstance(value, (int, float))
def preprocess_dict(data):
    """Recursively strip ordinal/number entries from a nested structure.

    * Lists are processed element by element.
    * Non-container values are returned unchanged.
    * A dict with at most one entry keeps its key and only has its value
      processed recursively.
    * A dict with several entries whose values are all "" or "/" collapses
      to the list of its keys.
    * Any other dict drops every entry flagged by
      ``contains_number_or_index`` and every entry whose processed value is
      the empty string.
    """
    if isinstance(data, dict):
        if len(data) <= 1:
            return {name: preprocess_dict(child) for name, child in data.items()}

        # Placeholder-only dict: the key names themselves are the content.
        if not any(val != "" and val != "/" for val in data.values()):
            return list(data.keys())

        kept = {}
        for name, child in data.items():
            if contains_number_or_index(name, child):
                continue
            cleaned = preprocess_dict(child)
            if cleaned != "":  # only keep non-empty values
                kept[name] = cleaned
        return kept

    if isinstance(data, list):
        return [preprocess_dict(element) for element in data]

    return data
def _natural_sort_key(key):
    """Split *key* into text and integer chunks so that "10" sorts after "2"."""
    chunks = re.split(r'(\d+)', key)
    # With a single capture group the odd positions are always the digit
    # runs, so text is only ever compared with text and ints with ints.
    return [int(chunk) if idx % 2 else chunk for idx, chunk in enumerate(chunks)]


def process_dict(data):
    """Recursively replace ordinal keys with an ordered list of their values.

    Non-dict input is returned unchanged. For a dict:

    * entries whose key satisfies ``is_numeric_key`` are collected, ordered
      by key and stored (recursively processed) under ``'items'``;
    * a list value with more than one element, where every element is a
      one-entry dict sharing the same key, is flattened to a list of the
      processed inner values; if the keys differ, each element is processed
      as a dict instead;
    * every other value is processed recursively.

    If the result holds nothing but ``'items'``, the bare list is returned.

    Ordinal keys are ordered with a natural sort. The previous plain
    ``sorted`` compared the key strings lexicographically, which put "10"
    before "2" and scrambled the item order.
    """
    if not isinstance(data, dict):
        return data

    result = {}
    ordinal_entries = []
    named_entries = {}
    for key, value in data.items():
        if is_numeric_key(key):
            ordinal_entries.append((key, value))
        else:
            named_entries[key] = value

    # Ordinal keys are handled even when there is only one of them.
    if ordinal_entries:
        ordinal_entries.sort(key=lambda entry: _natural_sort_key(entry[0]))
        result['items'] = [process_dict(value) for _, value in ordinal_entries]

    # Non-ordinal keys.
    for key, value in named_entries.items():
        is_list_of_single_entry_dicts = (
            isinstance(value, list)
            and len(value) > 1
            and all(isinstance(item, dict) and len(item) == 1 for item in value)
        )
        if is_list_of_single_entry_dicts:
            common_key = next(iter(value[0].keys()))
            if all(common_key in item and len(item) == 1 for item in value):
                result[key] = [process_dict(item[common_key]) for item in value]
            else:
                result[key] = [process_dict(item) for item in value]
        else:
            result[key] = process_dict(value)

    # When only 'items' is present, return the list itself.
    if len(result) == 1 and 'items' in result:
        return result['items']
    return result
# Test code: runs at module level, so it executes whenever this file is run or imported.
# Sample input: a nested dict whose "审查标准" list holds dicts with a "序号" entry
# and a "内容" entry.
input_data = {
"符合性审查": {
"说明": "评标委员会应当对符合资格的投标人的投标文件进行符合性审查,以确定其是否满足招标文件的实质性要求。",
"审查标准": [
{
"序号": 9,
"内容": "未按要求提供加盖公章及签字(签章)的;"
},
{
"序号": 1,
"内容": "a39"
},
{
"序号": 2,
"内容": "依据财库[2019]9号文的规定招标文件采购清单中为“节能产品”的货物未提供国家确定的认证机构出具的节能产品认证证书的"
},
]
}
}
# First pass only: apply preprocess_dict to the sample and print the result as JSON.
pred=preprocess_dict(input_data)
print(json.dumps(pred, ensure_ascii=False, indent=4))
# The second pass through process_dict is currently disabled:
# processed_data = process_dict(pred)
# print(json.dumps(processed_data, ensure_ascii=False, indent=4))

View File

@ -72,9 +72,8 @@ def generate_queries(truncate_file, required_keys):
return queries
def get_business_requirements(truncate_file):
def get_business_requirements(truncate_file,file_id):
required_keys = ["商务要求", "服务要求", "其他要求"]
file_id = upload_file(truncate_file)
queries = generate_queries(truncate_file, required_keys)
results = multi_threading(queries, "", file_id, 2)
business_requirements = [res for _, res in results] if results else []
@ -83,9 +82,11 @@ def get_business_requirements(truncate_file):
final_res = combine_json_results(business_requirements)
final_res.update({key: final_res.get(key, "") for key in required_keys})
return json.dumps(final_res, ensure_ascii=False, indent=4)
return final_res
if __name__ == "__main__":
truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
print(get_business_requirements(truncate_file))
file_id = upload_file(truncate_file)
res=get_business_requirements(truncate_file,file_id)
print(json.dumps(res, ensure_ascii=False, indent=4))

View File

@ -97,8 +97,7 @@ def get_technical_requirements(file_id):
final_res = postprocess(cleaned_res)
print("更新后的采购需求处理完成.")
# 输出最终的 JSON 字符串
json_string = json.dumps(final_res, ensure_ascii=False, indent=4)
return json_string
return final_res
def test_all_files_in_folder(input_folder, output_folder):
# 确保输出文件夹存在
@ -128,7 +127,8 @@ if __name__ == "__main__":
truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
file_id = upload_file(truncate_file)
res=get_technical_requirements(file_id)
print(res)
json_string = json.dumps(res, ensure_ascii=False, indent=4)
print(json_string)
# input_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1"
# output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output3"
# test_all_files_in_folder(input_folder, output_folder)

View File

@ -1,26 +1,40 @@
import concurrent.futures
import json
import time
from flask_app.货物标.货物标截取pdf import truncate_pdf_main
from flask_app.main.format_change import docx2pdf, pdf2docx
from flask_app.货物标.技术要求提取 import get_technical_requirements
from flask_app.main.通义千问long import upload_file
from 商务服务其他要求提取 import get_business_requirements
from flask_app.main.json_utils import nest_json_under_key
#获取采购清单
def fetch_purchasing_list(file_path,output_folder,file_type):
    """Convert the input document as needed and truncate its PDF form.

    ``file_type`` 1 means *file_path* is passed to ``docx2pdf`` to obtain the
    PDF; ``file_type`` 2 means *file_path* is already the PDF and is passed to
    ``pdf2docx`` as well. Any other value prints a message and returns None.

    The PDF is then handed to ``truncate_pdf_main(pdf, output_folder, 1)``.
    The truncation result is not returned; the function returns None on
    every path.
    """
    if file_type not in (1, 2):
        print("未传入指定格式的文件!")
        return None

    if file_type == 1:
        source_pdf = docx2pdf(file_path)
    else:
        source_pdf = file_path
        pdf2docx(file_path)

    truncate_pdf_main(source_pdf, output_folder, 1)
def fetch_procurement_reqs(truncate_file):
    """Extract the procurement requirements from a truncated tender file.

    Uploads *truncate_file* once, then runs ``get_technical_requirements``
    and ``get_business_requirements`` concurrently against the same
    ``file_id``.

    Returns a dict with the single key "采购要求" whose value holds the four
    sibling keys "技术要求", "商务要求", "服务要求" and "其他要求"; a missing
    section defaults to an empty dict.

    The earlier synchronous ``get_technical_requirements(file_id)`` call was
    removed: its result was never used and it duplicated the work submitted
    to the thread pool.
    """
    # Upload the file and obtain its file_id.
    file_id = upload_file(truncate_file)

    # Run both extractions in parallel on a thread pool.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_technical = executor.submit(get_technical_requirements, file_id)
        # NOTE(review): presumably staggers the two requests to the upstream
        # service; confirm before removing.
        time.sleep(1)
        future_business = executor.submit(get_business_requirements, truncate_file, file_id)

        # Collect the results; .result() re-raises any worker exception.
        technical_requirements = future_technical.result()
        business_requirements = future_business.result()

    # Build the final nested structure with the four keys as siblings.
    procurement_reqs = {
        "采购要求": {
            "技术要求": technical_requirements.get("技术要求", {}),
            "商务要求": business_requirements.get("商务要求", {}),
            "服务要求": business_requirements.get("服务要求", {}),
            "其他要求": business_requirements.get("其他要求", {})
        }
    }
    return procurement_reqs
if __name__ == "__main__":
    # Manual run against local sample files (hard-coded Windows paths).
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output"
    file_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\磋商文件.doc"
    fetch_purchasing_list(file_path,output_folder,1)
    # file_path is rebound to a truncated procurement PDF for the second call.
    file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_procurement.pdf"
    res=fetch_procurement_reqs(file_path)
    # Pretty-print the result, keeping non-ASCII characters readable.
    print(json.dumps(res, ensure_ascii=False, indent=4))

View File

@ -0,0 +1,13 @@
from flask_app.main.format_change import docx2pdf, pdf2docx
from flask_app.货物标.货物标截取pdf import truncate_pdf_main
def main_processing(output_folder,file_path,file_type, unique_id):
    """Convert the input document as needed and truncate its PDF form.

    ``file_type`` 1 means *file_path* is passed to ``docx2pdf`` to obtain the
    PDF; ``file_type`` 2 means *file_path* is already the PDF and is passed to
    ``pdf2docx`` as well. Any other value prints a message and returns None.

    The PDF is then handed to ``truncate_pdf_main(pdf, output_folder, 1)``.
    ``unique_id`` is accepted but not used, and the truncation result is not
    returned; the function returns None on every path.
    """
    if file_type not in (1, 2):
        print("未传入指定格式的文件!")
        return None

    if file_type == 1:
        source_pdf = docx2pdf(file_path)
    else:
        source_pdf = file_path
        pdf2docx(file_path)

    truncate_pdf_main(source_pdf, output_folder, 1)

View File

@ -0,0 +1,24 @@
import json
from flask_app.main.基础信息整合 import combine_basic_info
from flask_app.main.通义千问long import qianwen_long,upload_file
from flask_app.main.多线程提问 import multi_threading
from flask_app.main.json_utils import combine_json_results
def qualification_review(truncate_file):
    """Ask the model for the qualification and conformity review standards.

    Uploads *truncate_file*, sends two prompts through ``multi_threading``
    (one for '资格性审查', one for '符合性审查') and merges the responses with
    ``combine_json_results``.

    Returns whatever ``combine_json_results`` produces for the list of
    responses. If ``multi_threading`` returns nothing, that list is empty.
    """
    file_id=upload_file(truncate_file)
    user_query1="该招标文件中规定的资格性审查标准是怎样的请以json格式给出外层为'资格性审查',对于原文中的序号,你仅需要捕获它们之间的层级关系并根据序号后的内容生成嵌套键值对,若多个内容位于同一层级,你应用字符串列表作为键值保存这些内容,你的回答需删去这些序号,但其余内容要与原文一致,不可擅自总结删减,也不要回答符合性审查的内容。"
    user_query2="该招标文件中规定的符合性审查标准是怎样的请以json格式给出外层为'符合性审查',你的回答要与原文一致,不可擅自总结删减,也不要回答资格性审查的内容。"
    user_query=[user_query1, user_query2]
    results=multi_threading(user_query,"",file_id,2)
    # NOTE(review): guard against a falsy result so a failed batch does not
    # raise TypeError when unpacking. This assumes multi_threading can
    # return None or an empty list on failure; confirm against its
    # implementation.
    result_list=[response for _, response in results] if results else []
    combined_res = combine_json_results(result_list)  # merge the per-question JSON answers
    return combined_res
if __name__ == "__main__":
    # Manual run against a local sample file (hard-coded Windows path).
    truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output3\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_qualification1.pdf"
    res=qualification_review(truncate_file)
    # Pretty-print the result, keeping non-ASCII characters readable.
    print(json.dumps(res,ensure_ascii=False, indent=4))