10.25切分改为多线程

This commit is contained in:
zy123 2024-10-25 17:50:20 +08:00
parent 6ecc370d27
commit e73125f097
7 changed files with 115 additions and 41 deletions

View File

@ -13,6 +13,11 @@ def format_chinese_date(date_str):
str: 格式化后的日期字符串例如 "2019-07-18 09:30:00"
如果格式错误返回 None 并打印错误信息
"""
# 检查输入类型
if not isinstance(date_str, str):
print(f"输入类型错误: 期望字符串类型,实际类型为 {type(date_str)}")
return None
# print("------------")
# print(f"原始输入: {date_str}")
@ -104,6 +109,7 @@ def format_chinese_date(date_str):
if __name__ == "__main__":
input_dates = [
# 完整的日期和时间
["www"],
"2019年7月18日0930",
"20 19 年7 月18日 09 30整北京时间",
"2020年02月05日12时30分45秒",

View File

@ -97,18 +97,22 @@ def inner_post_processing(base_info):
str: 转换后的字符串
"""
if isinstance(value, list):
concatenated = []
for item in value:
if isinstance(item, dict):
pairs = []
for k, v in item.items():
if v not in ["未知", ""]:
pairs.append(f"{k}:{v}")
concatenated.append(" ".join(pairs))
else:
# 如果列表中有非字典项,直接转换为字符串
concatenated.append(str(item))
return " ".join(concatenated)
if all(isinstance(item, str) for item in value):
# 如果列表中的所有元素都是字符串,则用两个空格连接
return " ".join(item for item in value if item not in ["未知", ""])
else:
concatenated = []
for item in value:
if isinstance(item, dict):
pairs = []
for k, v in item.items():
if v not in ["未知", ""]:
pairs.append(f"{k}:{v}")
concatenated.append(" ".join(pairs))
else:
# 如果列表中有非字典且非字符串的项,直接转换为字符串
concatenated.append(str(item))
return " ".join(concatenated)
elif isinstance(value, dict):
pairs = []
for k, v in value.items():
@ -201,7 +205,7 @@ def inner_post_processing(base_info):
return extracted_info
def outer_post_processing(combined_data, includes):
def outer_post_processing(combined_data, includes,good_list):
"""
外层处理函数调用内层 post_processing 处理 '基础信息'并构建 processed_data
额外提取 '采购要求' 下的 '技术要求' 内容
@ -259,7 +263,7 @@ if __name__ == "__main__":
combined_data={
"基础信息": {
"招标人/代理信息": {
"招标人": "广水市公路管理局",
"招标人": ["广水市公路管理局","sss"],
"招标人联系方式": {
"名称": "广水市公路管理局",
"联系电话": "17362698785",
@ -695,9 +699,9 @@ if __name__ == "__main__":
"其他要求": ""
},
"关键时间/内容": {
"投标文件递交截止日期": "2021年月日点分",
"投标文件递交截止日期": "2021年121日点分",
"投标文件递交地点": "广水市公共资源交易中心五楼号开标室",
"开标时间": "未知",
"开标时间": "www",
"开标地点": "广水市公共资源交易中心五楼号开标室",
"澄清招标文件的截止时间": "未知",
"投标有效期": "提交投标文件截止之日起 60日历日",
@ -952,5 +956,5 @@ if __name__ == "__main__":
includes = ["基础信息", "资格审查", "商务评分", "技术评分", "无效标与废标项", "投标文件要求", "开评定标流程"]
res1,res2,res3=outer_post_processing(combined_data,includes)
# print(json.dumps(res2,ensure_ascii=False,indent=4))
print(json.dumps(res2,ensure_ascii=False,indent=4))
print(json.dumps(res3,ensure_ascii=False,indent=4))

View File

@ -11,7 +11,7 @@ from flask_app.main.download import download_file
from flask_app.general.post_processing import outer_post_processing
from flask_app.main.工程标解析main import engineering_bid_main
from flask_app.货物标.货物标解析main import goods_bid_main
from flask_app.货物标.技术要求提取 import get_technical_requirements_main
from flask_app.货物标.技术参数要求提取 import get_technical_requirements_main
app = Flask(__name__)
class CSTFormatter(logging.Formatter):
@ -337,6 +337,7 @@ def process_and_stream(file_url, zb_type):
logger.info("本地文件路径: " + downloaded_filepath)
combined_data = {}
good_list = None
# 根据zb_type选择调用的处理函数
processing_functions = {
@ -358,6 +359,11 @@ def process_and_stream(file_url, zb_type):
logger.error(f"Data received: {data}")
continue # Skip data if JSON parsing fails
if 'good_list' in parsed_data:
good_list = parsed_data['good_list']
logger.info("Collected good_list from the processing function.")
continue # Skip yielding good_list to the client
# 遍历 parsed_data 只提取内层内容进行合并
for outer_key, inner_dict in parsed_data.items():
if isinstance(inner_dict, dict):
@ -379,7 +385,7 @@ def process_and_stream(file_url, zb_type):
output_json_path = os.path.join(output_folder, 'final_result.json')
extracted_info_path=os.path.join(output_folder, 'extracted_result.json')
includes = ["基础信息", "资格审查", "商务评分", "技术评分", "无效标与废标项", "投标文件要求", "开评定标流程"]
final_result, extracted_info,procurement_reqs = outer_post_processing(combined_data, includes)
final_result, extracted_info,procurement_reqs = outer_post_processing(combined_data, includes,good_list)
logger.info(f"Procurement requirements extracted: {json.dumps(procurement_reqs, ensure_ascii=False, indent=4)}") # 添加日志记录
#采购需求
@ -404,7 +410,7 @@ def process_and_stream(file_url, zb_type):
except IOError as e:
logger.error(f"保存JSON文件时出错: {e}")
#取的数据
#取的数据
extracted_info_response = {
'message': 'extracted_info',
'filename': os.path.basename(downloaded_filepath),

View File

@ -8,37 +8,60 @@ from flask_app.general.json_utils import clean_json_string, combine_json_results
from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main
def generate_key_paths(data, parent_key=''):
    """
    Collect the leaf keys of a nested dict.

    A non-empty dict is descended into and is never recorded itself. Every
    other value (empty dict, list, placeholder string such as "未知", or any
    other scalar) is treated as a leaf and recorded.

    Args:
        data (dict): The nested dict to walk.
        parent_key (str): Dotted path of the enclosing keys; used by the
            recursive calls, callers normally leave it empty.

    Returns:
        tuple: ``(key_paths, good_list, no_keys_added)`` where
            ``key_paths`` is the list of dotted paths to each leaf,
            ``good_list`` is the list of leaf key names (innermost key only),
            and ``no_keys_added`` is True only when no leaf was recorded
            anywhere in ``data`` (i.e. ``data`` is empty).
    """
    key_paths = []
    good_list = []
    no_keys_added = True  # stays True until the first leaf is recorded

    for key, value in data.items():
        # Dotted path from the root down to this key.
        current_key = f"{parent_key}.{key}" if parent_key else key

        if isinstance(value, dict) and value:
            # Non-empty dict: recurse and merge the child's results.
            sub_key_paths, sub_good_list, sub_no_keys_added = generate_key_paths(value, current_key)
            key_paths.extend(sub_key_paths)
            good_list.extend(sub_good_list)
            no_keys_added = no_keys_added and sub_no_keys_added
        else:
            # Everything else is a leaf. The previous per-type branches all
            # performed exactly these three statements.
            key_paths.append(current_key)
            good_list.append(key)
            no_keys_added = False

    return key_paths, good_list, no_keys_added
def combine_and_update_results(original_data, updates):
def recursive_update(data, key, value):
@ -65,6 +88,7 @@ def postprocess(data):
# 递归处理顶层数据
return {key: convert_dict(val) if isinstance(val, dict) else val for key, val in data.items()}
def get_technical_requirements(file_id):
user_query1 = """
这是一份货物标中采购要求部分的内容请告诉我需要采购的系统或货物如果有采购清单请直接根据清单上的货物名称给出结果若没有采购清单你要从文中摘取需要采购的系统或货物采购需求中可能包含层次关系如某大系统中包含若干货物那么需要用嵌套键值对表示这种关系请以json格式返回最外层键名为'采购需求'嵌套键名为对应的系统名称或货物名称需与原文保持一致无需给出采购数量和单位如有未知内容在对应键值处填'未知'以下为示例输出
@ -82,9 +106,9 @@ def get_technical_requirements(file_id):
"""
res = qianwen_long(file_id, user_query1)
cleaned_res = clean_json_string(res)
print(res)
keys_list ,no_keys_added= generate_key_paths(cleaned_res['采购需求']) # 提取需要采购的货物清单
cleaned_res = clean_json_string(res) #转字典
# print(res)
keys_list,good_list,no_keys_added= generate_key_paths(cleaned_res['采购需求']) # 提取需要采购的货物清单
if '采购需求' in cleaned_res:
cleaned_res['技术要求'] = cleaned_res.pop('采购需求')
if no_keys_added:
@ -111,6 +135,7 @@ def get_technical_requirements(file_id):
# 更新原始采购需求字典
combine_and_update_results(cleaned_res['技术要求'], technical_requirements_combined_res)
final_res = postprocess(cleaned_res)
final_res['技术要求']["货物列表"] = good_list #添加需要采购的货物
# 输出最终的 JSON 字符串
return final_res
@ -148,11 +173,12 @@ def get_technical_requirements_main(file_path,output_folder):
else:
return final_res
if __name__ == "__main__":
# truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\zboutpub\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件定稿三次_procurement.pdf"
truncate_file="C:\\Users\\Administrator\\Desktop\\fsdownload\\217754b7-3efd-41b2-806b-0b5b1bc98904\\ztbfile.pdf"
output_folder="C:\\Users\\Administrator\\Desktop\\fsdownload\\217754b7-3efd-41b2-806b-0b5b1bc98904\\tmp"
truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件定稿三次_procurement.pdf"
# truncate_file="C:\\Users\\Administrator\\Desktop\\fsdownload\\217754b7-3efd-41b2-806b-0b5b1bc98904\\ztbfile.pdf"
output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\tmp"
file_id = upload_file(truncate_file)
res=get_technical_requirements_main(truncate_file,output_folder)
res=get_technical_requirements(file_id)
# res=get_technical_requirements_main(truncate_file,output_folder)
json_string = json.dumps(res, ensure_ascii=False, indent=4)
print(json_string)
# input_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1"

View File

@ -1,7 +1,7 @@
import concurrent.futures
import json
import time
from flask_app.货物标.技术要求提取 import get_technical_requirements
from flask_app.货物标.技术参数要求提取 import get_technical_requirements
from flask_app.general.通义千问long import upload_file
from flask_app.货物标.商务服务其他要求提取 import get_business_requirements
@ -38,7 +38,8 @@ def fetch_procurement_reqs(truncate_file):
"技术要求": technical_requirements.get("技术要求", {}),
"商务要求": business_requirements.get("商务要求", {}),
"服务要求": business_requirements.get("服务要求", {}),
"其他要求": business_requirements.get("其他要求", {})
"其他要求": business_requirements.get("其他要求", {}),
"货物列表":business_requirements.get("货物列表",{})
}
return procurement_reqs

View File

@ -81,8 +81,9 @@ def preprocess_files(output_folder, file_path, file_type):
def fetch_project_basic_info(merged_baseinfo_path, procurement_file_path):  # front table of the instructions to bidders
    """
    Build the basic-info section and split the goods list out of it.

    Calls ``combine_basic_info`` on the two input paths, then passes the
    result through ``post_process_baseinfo``.

    Args:
        merged_baseinfo_path: Forwarded unchanged to ``combine_basic_info``.
        procurement_file_path: Forwarded unchanged to ``combine_basic_info``.

    Returns:
        tuple: ``(base_info, good_list)`` exactly as returned by
            ``post_process_baseinfo``.
    """
    logger.info("starting基础信息...")
    basic_res = combine_basic_info(merged_baseinfo_path, procurement_file_path)
    # The caller unpacks a 2-tuple, so the post-processed pair is the only
    # valid return value; the earlier bare `return basic_res` is dropped.
    base_info, good_list = post_process_baseinfo(basic_res)
    logger.info("基础信息done")
    return base_info, good_list
def fetch_qualification_review(output_folder, qualification_path, notice_path,merged_baseinfo_path): # 资格审查
@ -130,6 +131,30 @@ def fetch_bid_opening(clause_path):
return {"开评定标流程": fetch_bid_opening_json}
def post_process_baseinfo(base_info):
    """
    Detach the goods list from the basic-info payload.

    Looks up ``base_info['采购要求']['技术要求']['货物列表']`` and removes that
    entry from the nested dict (``base_info`` is modified in place). Missing
    keys at any level yield an empty list.

    Args:
        base_info (dict): The raw basic-info data.

    Returns:
        tuple: ``(base_info, good_list)``; ``good_list`` is ``[]`` when the
            entry is absent or an intermediate value is not a dict.
    """
    try:
        procurement_reqs = base_info.get('采购要求', {})
        technical_requirements = procurement_reqs.get('技术要求', {})
        # pop() so the goods list is not also emitted as part of base_info.
        good_list = technical_requirements.pop('货物列表', [])
    except (AttributeError, TypeError) as e:
        # An intermediate value was not a dict (e.g. a placeholder string
        # or a list); fall back to an empty goods list.
        logger.error(f"Error in post_process_baseinfo: {e}")
        return base_info, []

    logger.info(f"Extracted good_list: {good_list}")
    return base_info, good_list
def goods_bid_main(output_folder, file_path, file_type, unique_id):
global logger
logger = get_global_logger(unique_id)
@ -162,6 +187,10 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
key = next(k for k, v in futures.items() if v == future)
try:
result = future.result()
if key == 'base_info':
base_info, good_list = result
collected_good_list = good_list # Store good_list for later use
yield json.dumps({'base_info': transform_json_values(base_info)}, ensure_ascii=False)
# 如果是 evaluation_standards拆分技术标和商务标
if key == 'evaluation_standards':
technical_standards = result["technical_standards"]
@ -177,10 +206,12 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
except Exception as exc:
logger.error(f"Error processing {key}: {exc}")
yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)
if collected_good_list is not None:
yield json.dumps({'good_list': transform_json_values(collected_good_list)}, ensure_ascii=False)
#TODO:目前的无效标这块的键值都删去空格了,所有的键名都删去空格
#广水市 2022 年义务教育学校多媒体补充采购项目 资格审查有问题
#TODO:区分output目录 陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目 (1)_tobidders_notice_part2.pdf提取有问题 #一包二包问题 107国道 #日期格式统一
#good_list 金额 截取上下文
if __name__ == "__main__":
import time