196 lines
7.8 KiB
Python
196 lines
7.8 KiB
Python
import re
|
||
|
||
|
||
# Values that mean "no usable data" in the parsed tender document.
_PLACEHOLDER_VALUES = ("未知", "")

# Matches a monetary amount followed by the unit 元 or 万元. The number part is
# either digits with optional thousands separators (ASCII or full-width comma)
# and decimals, plain digits with optional decimals, or a run of CJK characters.
# Compiled once here because the bid-bond search is recursive.
_AMOUNT_PATTERN = re.compile(r'(?:\d{1,3}(?:[,,]\d{3})*(?:\.\d+)?|\d+(?:\.\d+)?|[\u4e00-\u9fff]+(?:\.\d+)?)\s*(?:元|万元)')

# Key fragments that identify a bid-bond entry, in priority order.
_BID_BOND_KEYS = ("投标保证金", "磋商保证金")

# Output field -> candidate key paths inside base_info, tried in order.
# '联系人', '联系电话', '招标项目地点' and '投标保证金' are handled separately.
_FIELD_PATHS = {
    "代理机构名称": [["招标人/代理信息", "招标代理机构"]],
    "招标项目名称": [["项目信息", "项目名称"], ["项目信息", "工程名称"]],
    "招标项目编号": [["项目信息", "项目编号"], ["项目信息", "招标编号"]],
    "开标时间": [["关键时间/内容", "开标时间"]],
    "报名截止日期": [["关键时间/内容", "投标文件递交截止日期"]],
    "招标项目预算": [["项目信息", "招标控制价"]],
    "招标单位名称": [["招标人/代理信息", "招标人"]],
    "招标公告地址": [["关键时间/内容", "信息公示媒介"], ["关键时间/内容", "评标结果公示媒介"]],
}

# Key fragments used to locate the contact name / phone, in priority order.
_NAME_CANDIDATES = ("名称", "联系人", "招标")
_PHONE_CANDIDATES = ("电话", "手机", "联系方式")


def _is_placeholder(value):
    """Return True when value is None or one of the "no data" placeholders."""
    return value is None or value in _PLACEHOLDER_VALUES


def _get_nested(dic, keys, default=None):
    """Follow keys through nested dicts; return default once a level is not a dict."""
    for key in keys:
        if not isinstance(dic, dict):
            return default
        dic = dic.get(key, default)
    return dic


def _find_keys_containing(dic, substring):
    """Recursively collect the values of every key that contains substring.

    Nested dicts and dicts inside lists are searched as well. A non-dict
    input yields an empty list.
    """
    found_values = []
    if not isinstance(dic, dict):
        return found_values
    for key, value in dic.items():
        if substring in key:
            found_values.append(value)
        if isinstance(value, dict):
            found_values.extend(_find_keys_containing(value, substring))
        elif isinstance(value, list):
            for item in value:
                if isinstance(item, dict):
                    found_values.extend(_find_keys_containing(item, substring))
    return found_values


def _extract_field(contact_info, candidate_keys):
    """Return the first non-placeholder value whose key contains a candidate.

    Candidates are tried in priority order. A contact_info that is not a dict
    (for example the placeholder string "未知") yields "" instead of raising.
    """
    if not isinstance(contact_info, dict):
        return ""
    for candidate in candidate_keys:
        for key, value in contact_info.items():
            # Skip None/placeholder values so a later key can still match.
            if candidate in key and not _is_placeholder(value):
                return value
    return ""


def _search_amount(text):
    """Return the first amount-like substring of text, or "" when none is found."""
    match = _AMOUNT_PATTERN.search(text)
    return match.group() if match else ""


def _extract_bid_bond(guarantee_info):
    """Extract the bid-bond amount from the guarantee section.

    Step 1 looks for a key containing one of the bid-bond fragments and takes
    an explicit amount from it. Step 2 falls back to scanning every string
    value (recursing into nested dicts) for text that looks like an amount.
    A string section is scanned directly; any other non-dict yields "".
    """
    if isinstance(guarantee_info, str):
        return _search_amount(guarantee_info)
    if not isinstance(guarantee_info, dict):
        return ""

    # Step 1: explicit amount under a bid-bond key.
    for candidate in _BID_BOND_KEYS:
        for key, value in guarantee_info.items():
            if candidate not in key:
                continue
            if isinstance(value, dict):
                for sub_key, sub_value in value.items():
                    if "金额" in sub_key and not _is_placeholder(sub_value):
                        return sub_value
            elif isinstance(value, str):
                # A plain string is only trusted when the key itself names an amount.
                if "金额" in key and not _is_placeholder(value):
                    return value

    # Step 2: pattern search over all values.
    for value in guarantee_info.values():
        if isinstance(value, str):
            found_amount = _search_amount(value)
        elif isinstance(value, dict):
            found_amount = _extract_bid_bond(value)
        else:
            continue
        if found_amount:
            return found_amount
    return ""


def _first_contact_field(contacts, candidate_keys):
    """Return the first truthy field extracted from contacts, or ""."""
    for contact in contacts:
        extracted = _extract_field(contact, candidate_keys)
        if extracted:
            return extracted
    return ""


def inner_post_processing(base_info):
    """
    Process the '基础信息' section and extract the required summary fields.

    Args:
        base_info (dict): The '基础信息' dictionary. Any non-dict value is
            treated as an empty dictionary.

    Returns:
        dict: extracted_info. It always contains the keys of the field
        mapping plus '招标项目地点', '联系人', '联系电话' and '投标保证金';
        a field that cannot be found is set to "".
    """
    if not isinstance(base_info, dict):
        base_info = {}

    extracted_info = {}

    # Directly mapped fields: the first truthy value among the candidate paths wins.
    for new_key, paths in _FIELD_PATHS.items():
        candidates = (_get_nested(base_info, path) for path in paths)
        extracted_info[new_key] = next((value for value in candidates if value), "")

    # Project location: first value under '项目信息' whose key contains "地点".
    location_candidates = _find_keys_containing(base_info.get("项目信息", {}), "地点")
    extracted_info["招标项目地点"] = location_candidates[0] if location_candidates else ""

    # Contact name / phone: the project contact takes precedence over the tenderer contact.
    contacts = (
        _get_nested(base_info, ["招标人/代理信息", "项目联系方式"], {}),
        _get_nested(base_info, ["招标人/代理信息", "招标人联系方式"], {}),
    )
    extracted_info["联系人"] = _first_contact_field(contacts, _NAME_CANDIDATES)
    extracted_info["联系电话"] = _first_contact_field(contacts, _PHONE_CANDIDATES)

    # Bid bond amount from the '保证金相关' section.
    guarantee_info = _get_nested(base_info, ["保证金相关"], {})
    extracted_info["投标保证金"] = _extract_bid_bond(guarantee_info)

    return extracted_info
|
||
|
||
def outer_post_processing(combined_data, includes):
    """
    Split combined_data into included sections and an '其他' bucket.

    When '基础信息' is listed in includes, that section is passed to
    inner_post_processing and kept in the result unchanged.

    Args:
        combined_data (dict): The original merged data.
        includes (list): Keys that are kept as top-level entries.

    Returns:
        tuple: (processed_data, extracted_info). extracted_info stays an
        empty dict when '基础信息' is not listed in includes.
    """
    # The '其他' bucket is created first and removed again at the end if unused.
    processed_data = {"其他": {}}
    extracted_info = {}

    if "基础信息" in includes:
        base_section = combined_data.get("基础信息", {})
        extracted_info = inner_post_processing(base_section)
        processed_data["基础信息"] = base_section

    for name, section in combined_data.items():
        if name not in includes:
            # Anything not explicitly requested is grouped under '其他'.
            processed_data["其他"][name] = section
        elif name != "基础信息":
            # '基础信息' was already stored above; copy every other included key.
            processed_data[name] = section

    if not processed_data["其他"]:
        del processed_data["其他"]

    return processed_data, extracted_info