zbparse/flask_app/main/投标人须知正文提取指定内容.py

121 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import re
# 定义查找与目标值匹配的键的函数
def find_keys_by_value(target_value, json_data):
    """Return the keys of ``json_data`` whose value matches ``target_value``.

    Exact equality is tried first. Only when no value is equal to
    ``target_value`` does the search fall back to string values that start
    with ``target_value``.

    Args:
        target_value: The value (normally a heading text) to look for.
        json_data: Mapping to search.

    Returns:
        A list of matching keys in mapping order; empty when nothing matches.
    """
    exact_hits = []
    for name, content in json_data.items():
        if content == target_value:
            exact_hits.append(name)
    if exact_hits:
        return exact_hits

    # No exact match: accept string values that merely begin with the target.
    prefix_hits = []
    for name, content in json_data.items():
        if isinstance(content, str) and content.startswith(target_value):
            prefix_hits.append(name)
    return prefix_hits
# 定义查找以特定前缀开始的键的函数
def find_keys_with_prefix(key_prefix, json_data):
    """Return every key of ``json_data`` that starts with ``key_prefix``.

    The comparison is a plain textual prefix test, so the key equal to
    ``key_prefix`` is included, and a prefix such as ``"3.1"`` also matches
    ``"3.10"``.

    Args:
        key_prefix: Prefix string to test each key against.
        json_data: Mapping (or any iterable of string keys) to scan.

    Returns:
        A list of the matching keys in iteration order.
    """
    return [name for name in json_data if name.startswith(key_prefix)]
# Extract the entries belonging to the given target headings from already-loaded JSON data
def extract_json(data, target_values):
    """Collect the entries of ``data`` that belong to the given headings.

    For every target value, the keys whose value matches it are looked up
    with ``find_keys_by_value``; each such key, together with every key that
    shares its prefix (``find_keys_with_prefix``), is copied into the result.

    Args:
        data: Mapping of clause-number keys to their text.
        target_values: Iterable of heading texts to search for.

    Returns:
        A new dict holding the selected keys with their values from ``data``.
    """
    results = {}
    for target_value in target_values:
        for heading_key in find_keys_by_value(target_value, data):
            for subkey in find_keys_with_prefix(heading_key, data):
                if "." in subkey:
                    # Everything before the last dot is the immediate parent.
                    parent_key = subkey.rpartition('.')[0]
                    top_level_key = parent_key.partition('.')[0] + '.'
                    # Special case for "定标": seed its top-level key once so
                    # that it is present before any of its subkeys.
                    if target_value == "定标" and top_level_key not in results:
                        results[top_level_key] = "定标"
                    # Bring in the parent entry when it exists and is not
                    # already part of the result.
                    if parent_key not in results and parent_key in data:
                        results[parent_key] = data[parent_key]
                # Always record the current key itself.
                results[subkey] = data[subkey]
    return results
# 转换结构化的JSON数据
def transform_json(data):
    """Rebuild a flat ``{clause number: text}`` mapping as a nested structure.

    Each key is matched from its start against up to three dot-separated
    numbers (``"1."``, ``"1.1"``, ``"1.1.1"``); keys that do not start with a
    digit are ignored. Entries are processed in mapping order:

    * Keys with one or two numbers are headings. The first
      whitespace-separated word of the text becomes a key in the parent and
      holds a list that receives the heading's children. When the text is
      blank, the clause number itself is used as the key.
    * A two-number heading whose text contains a newline is stored as
      ``{first line: remaining text}`` and opens no child list.
    * Keys with three numbers are leaves; their text is appended to the list
      of the nearest open heading.

    A clause is attached to the deepest heading that is still open above it,
    so a child whose direct parent was never opened (or was stored in the
    newline form) goes to the closest ancestor instead of raising
    ``KeyError`` or landing in a list left over from an earlier heading.

    After building, lists are collapsed: a one-element list is replaced by
    its element and an empty list stored in a dict becomes ``""``.

    Args:
        data: Mapping of clause-number strings to text strings.

    Returns:
        The nested dict described above.
    """
    numbering = re.compile(r'(\d+)(?:\.(\d+))?(?:\.(\d+))?')
    leaf_depth = numbering.groups  # keys using every number group are leaves
    result = {}
    # Currently open container for each depth; depth 0 is the result itself.
    containers = {0: result}

    def first_word_or(text, fallback):
        # Blank text has no first word; fall back instead of raising IndexError.
        words = text.split()
        return words[0] if words else fallback

    def remove_single_item_lists(node):
        # Collapse one-element lists and blank out empty lists held by dicts.
        if isinstance(node, dict):
            for name in list(node):
                cleaned = remove_single_item_lists(node[name])
                if isinstance(cleaned, list) and not cleaned:
                    cleaned = ""
                node[name] = cleaned
            return node
        if isinstance(node, list):
            # Descend into every item so dicts inside longer lists are
            # normalised as well.
            items = [remove_single_item_lists(item) for item in node]
            return items[0] if len(items) == 1 else items
        return node

    for key, value in data.items():
        match = numbering.match(key)
        if not match:
            continue
        depth = sum(part is not None for part in match.groups())
        # Depth 0 is always registered, so an ancestor always exists.
        parent = containers[max(level for level in containers if level < depth)]

        if depth == leaf_depth:
            if isinstance(parent, list):
                parent.append(value)
            else:
                parent[first_word_or(value, key)] = value
            continue

        # A new heading closes every container at its own depth or deeper,
        # so later children cannot be attached to a previous sibling.
        for level in [lvl for lvl in containers if lvl >= depth]:
            del containers[level]

        if isinstance(parent, list):
            # Headings live in a dict that is the last element of the list.
            if not parent or not isinstance(parent[-1], dict):
                parent.append({})
            parent = parent[-1]

        if depth == 2 and '\n' in value and value.strip():
            # Title and body share one entry: split them on the first newline.
            title, body = value.split('\n', 1)
            parent[title.strip()] = body.strip()
        else:
            containers[depth] = parent.setdefault(first_word_or(value, key), [])

    return remove_single_item_lists(result)
# Load the clause JSON file, extract the requested sections and return them in nested form
def extract_from_notice(clause_path, type):
    """Load a clause JSON file and return the sections selected by ``type``.

    The file is read as UTF-8 JSON, filtered with ``extract_json`` using the
    heading texts that belong to ``type``, and the result is restructured
    with ``transform_json``.

    Args:
        clause_path: Path of the JSON file to read.
        type: Selector for the headings to extract:
            1 -> "投标文件", "投标";
            2 -> "开标", "评标", "定标";
            3 -> "重新招标、不再招标和终止招标", "重新招标", "不再招标", "终止招标".
            The name shadows the builtin but is kept so existing keyword
            callers continue to work.

    Returns:
        The nested structure produced by ``transform_json``.

    Raises:
        ValueError: If ``type`` is not 1, 2 or 3.
    """
    targets_by_type = {
        1: ["投标文件", "投标"],
        2: ["开标", "评标", "定标"],
        3: ["重新招标、不再招标和终止招标", "重新招标", "不再招标", "终止招标"],
    }
    try:
        target_values = targets_by_type[type]
    except (KeyError, TypeError):
        # TypeError covers unhashable selectors, which are just as invalid.
        raise ValueError(
            "Invalid type specified. Use 1 for '投标文件, 投标', "
            "2 for '开标, 评标, 定标' or 3 for '重新招标, 不再招标, 终止招标'."
        ) from None

    with open(clause_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    extracted_data = extract_json(data, target_values)
    return transform_json(extracted_data)
# 假设原始数据文件路径
if __name__ == "__main__":
    # Sample input; the relative path is resolved against the working directory.
    clause_file = 'clause2.json'
    try:
        # Change the second argument (1, 2 or 3) to try the other selections.
        print(extract_from_notice(clause_file, 3))
    except ValueError as err:
        print(err)