zbparse/flask_app/main/post_processing.py
2024-10-17 15:33:58 +08:00

196 lines
7.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
def inner_post_processing(base_info):
"""
处理 '基础信息' 部分,提取所需字段。
参数:
base_info (dict): 包含 '基础信息' 的字典。
返回:
dict: 提取的信息字典 extracted_info。
"""
# 初始化提取的信息字典
extracted_info = {}
# 定义一个辅助函数用于获取嵌套字典中的值
def get_nested(dic, keys, default=None):
for key in keys:
if isinstance(dic, dict):
dic = dic.get(key, default)
else:
return default
return dic
# 定义一个辅助函数用于递归查找包含特定子字符串的键
def find_keys_containing(dic, substring):
found_values = []
if isinstance(dic, dict):
for key, value in dic.items():
if substring in key:
found_values.append(value)
if isinstance(value, dict):
found_values.extend(find_keys_containing(value, substring))
elif isinstance(value, list):
for item in value:
if isinstance(item, dict):
found_values.extend(find_keys_containing(item, substring))
return found_values
# 定义一个辅助函数用于根据候选键列表提取值(部分匹配)
def extract_field(contact_info, candidate_keys):
for candidate in candidate_keys:
for key, value in contact_info.items():
if candidate in key and value not in ["未知", ""]:
return value
return ""
# 定义一个辅助函数用于提取 '投标保证金'
def extract_bid_bond(guarantee_info):
# 定义投标保证金的候选键
bid_bond_candidates = ["投标保证金", "磋商保证金"]
# 第一步:查找包含 "投标保证金" 或 "磋商保证金" 的键
for candidate in bid_bond_candidates:
for key, value in guarantee_info.items():
if candidate in key:
if isinstance(value, dict):
# 在嵌套字典中查找包含 "金额" 的键
for sub_key, sub_value in value.items():
if "金额" in sub_key and sub_value not in ["未知", ""]:
return sub_value
elif isinstance(value, str):
if "金额" in key and value not in ["未知", ""]:
return value
else:
# 如果 value 既不是 dict 也不是 str忽略
continue
# 第二步:如果没有找到包含 "金额" 的键,尝试在所有键值中查找符合模式的值
amount_pattern = re.compile(r'(?:\d{1,3}(?:[,]\d{3})*(?:\.\d+)?|\d+(?:\.\d+)?|[\u4e00-\u9fff]+(?:\.\d+)?)\s*(?:元|万元)')
for key, value in guarantee_info.items():
if isinstance(value, str):
match = amount_pattern.search(value)
if match:
return match.group()
elif isinstance(value, dict):
# 递归查找嵌套字典中的金额
found_amount = extract_bid_bond(value)
if found_amount:
return found_amount
# 如果都没有找到,则返回空字符串
return ""
# 定义所需字段的映射关系,暂时不包含'联系人'和'联系电话'以及'招标项目地点'
mapping = {
"代理机构名称": [["招标人/代理信息", "招标代理机构"]],
"招标项目名称": [["项目信息", "项目名称"], ["项目信息", "工程名称"]],
"招标项目编号": [["项目信息", "项目编号"], ["项目信息", "招标编号"]],
"开标时间": [["关键时间/内容", "开标时间"]],
"报名截止日期": [["关键时间/内容", "投标文件递交截止日期"]],
"招标项目预算": [["项目信息", "招标控制价"]],
"招标单位名称": [["招标人/代理信息", "招标人"]],
"招标公告地址": [["关键时间/内容", "信息公示媒介"], ["关键时间/内容", "评标结果公示媒介"]]
}
# 提取并映射字段
for new_key, paths in mapping.items():
value = None
for path in paths:
value = get_nested(base_info, path)
if value:
break
extracted_info[new_key] = value if value else ""
# 特殊处理 '招标项目地点'
# 在 '项目信息' 下查找包含 "地点" 的键
project_info = base_info.get("项目信息", {})
location_candidates = find_keys_containing(project_info, "地点")
if location_candidates:
# 选择第一个找到的地点
extracted_info["招标项目地点"] = location_candidates[0]
else:
extracted_info["招标项目地点"] = ""
# 特殊处理 '联系人' 和 '联系电话'
# 提取 '项目联系方式'
project_contact = get_nested(base_info, ["招标人/代理信息", "项目联系方式"], {})
# 提取 '招标人联系方式'
bidder_contact = get_nested(base_info, ["招标人/代理信息", "招标人联系方式"], {})
# 定义候选键列表,按优先级排序
name_candidates = ["名称", "联系人", "招标"]
phone_candidates = ["电话", "手机", "联系方式"]
# 提取 '联系人'
contact_names = [project_contact, bidder_contact]
contact_name = ""
for contact in contact_names:
extracted_name = extract_field(contact, name_candidates)
if extracted_name:
contact_name = extracted_name
break
extracted_info["联系人"] = contact_name
# 提取 '联系电话'
contact_phones = [project_contact, bidder_contact]
contact_phone = ""
for contact in contact_phones:
extracted_phone = extract_field(contact, phone_candidates)
if extracted_phone:
contact_phone = extracted_phone
break
extracted_info["联系电话"] = contact_phone
# 特殊处理 '投标保证金'
# 提取 '保证金相关'
guarantee_info = get_nested(base_info, ["保证金相关"], {})
extracted_info["投标保证金"] = extract_bid_bond(guarantee_info)
return extracted_info
def outer_post_processing(combined_data, includes):
"""
外层处理函数,调用内层 post_processing 处理 '基础信息',并构建 processed_data。
参数:
combined_data (dict): 原始合并数据。
includes (list): 需要包含的键列表。
返回:
tuple: (processed_data, extracted_info)
"""
# 初始化结果字典,预设'其他'分类为空字典
processed_data = {"其他": {}}
# 初始化提取的信息字典
extracted_info = {}
# 检查 '基础信息' 是否在 includes 中
if "基础信息" in includes:
base_info = combined_data.get("基础信息", {})
# 调用内层 post_processing 处理 '基础信息'
extracted_info = inner_post_processing(base_info)
# 将 '基础信息' 保留在处理后的数据中
processed_data["基础信息"] = base_info
# 遍历原始字典的每一个键值对
for key, value in combined_data.items():
if key in includes:
if key == "基础信息":
# 已经处理 '基础信息',无需再次添加
continue
else:
# 直接保留包含在 includes 列表中的键值对
processed_data[key] = value
else:
# 将不在 includes 列表中的键值对加入到 '其他' 分类中
processed_data["其他"][key] = value
# 如果 '其他' 分类没有任何内容,可以选择删除这个键
if not processed_data["其他"]:
del processed_data["其他"]
return processed_data, extracted_info