zbparse/flask_app/main/投标人须知正文提取指定内容.py
2024-09-25 15:58:27 +08:00

156 lines
6.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import re
# 对于每个target_value元素如果有完美匹配json_data中的键那就加入这个完美匹配的键名否则把全部模糊匹配到的键名都加入
def find_keys_by_value(target_value, json_data):
matched_keys = [k for k, v in json_data.items() if v == target_value] #首先检查 JSON 中的每个键值对,如果值完全等于目标值,则将这些键收集起来。
if not matched_keys:
matched_keys = [k for k, v in json_data.items() if isinstance(v, str) and v.startswith(target_value)] #如果没有找到完全匹配的键,它会检查字符串类型的值是否以目标值开头,并收集这些键。
return matched_keys # eg:[3.1,3.1.1,3.1.2,3.2...]
# 定义查找以特定前缀开始的键的函数eg:若match_keys中有3.1那么以3.1为前缀的键都会被找出来如3.1.1 3.1.2...
def find_keys_with_prefix(key_prefix, json_data):
subheadings = [k for k in json_data if k.startswith(key_prefix)]
return subheadings
# 从文件中读取JSON数据并提取特定内容
def extract_json(data, target_values):
results = {}
for target_value in target_values:
matched_keys = find_keys_by_value(target_value, data)
for key in matched_keys:
key_and_subheadings = find_keys_with_prefix(key, data)
for subkey in key_and_subheadings:
if "." in subkey:
parent_key = subkey.rsplit('.', 1)[0]
top_level_key = parent_key.split('.')[0] + '.'
# 特别处理定标相关的顶级键,确保不会重复添加其他键
if top_level_key not in results:
results[top_level_key] = target_value
# 添加或更新父级键
if parent_key not in results:
if parent_key in data:
results[parent_key] = data[parent_key]
# 添加当前键
results[subkey] = data[subkey]
return results
def sort_clean_data_keys(data):
# 预处理:删除键名中的空格
def preprocess_key(key):
return re.sub(r'\s+', '', key)
# 将键转换成由整数构成的元组,作为排序依据
def key_func(key):
return tuple(int(part) for part in re.split(r'\D+', key) if part)
# 创建一个新的字典,键名经过预处理
preprocessed_data = {preprocess_key(key): value for key, value in data.items()}
# 对预处理后的字典键进行排序
sorted_keys = sorted(preprocessed_data.keys(), key=key_func)
# 创建一个新的字典,按照排序后的键添加键值对
sorted_data = {key: preprocessed_data[key] for key in sorted_keys}
return sorted_data
# 转换结构化的JSON数据
def transform_json(data):
result = {}
temp = {0: result} # 初始化根字典
# 首先,创建一个临时字典用于检查是否存在三级标题
has_subkey = {}
for key in data.keys():
parts = key.split('.')
if len(parts) > 2 and parts[1]:
parent_key = parts[0] + '.' + parts[1]
has_subkey[parent_key] = True
for key, value in data.items():
match = re.match(r'(\d+)(?:\.(\d+))?(?:\.(\d+))?', key)
if match:
levels = [int(l) for l in match.groups() if l is not None]
if (len(levels) - 1) in temp:
parent = temp[len(levels) - 1]
else:
print(f"No parent found at level {len(levels) - 1} for key '{key}'. Check the data structure.")
continue
if len(levels) == len(match.groups()):
if isinstance(parent, list):
parent.append(value)
else:
# 对于根级别,使用完整的值作为键
if len(levels) == 1:
parent[value] = {}
temp[len(levels)] = parent[value]
else:
parent[value.split()[0]] = value
else:
new_key = value
if '\n' in value and len(levels) == 2 and f"{levels[0]}.{levels[1]}" not in has_subkey:
new_key, new_value = value.split('\n', 1)
new_key = new_key.strip()
new_value = new_value.strip()
if isinstance(parent, list):
if len(parent) == 0 or not isinstance(parent[-1], dict):
parent.append({})
parent[-1][new_key] = new_value
else:
parent[new_key] = new_value
else:
if isinstance(parent, list):
if len(parent) == 0 or not isinstance(parent[-1], dict):
parent.append({})
parent = parent[-1]
if new_key not in parent:
parent[new_key] = []
temp[len(levels)] = parent[new_key]
def remove_single_item_lists(node):
if isinstance(node, dict):
for key in list(node.keys()):
node[key] = remove_single_item_lists(node[key])
if isinstance(node[key], list) and not node[key]:
node[key] = ""
elif isinstance(node, list) and len(node) == 1:
return remove_single_item_lists(node[0])
return node
return remove_single_item_lists(result)
# 读取JSON数据提取内容转换结构并打印结果
def extract_from_notice(clause_path, type):
if type == 1:
target_values = ["投标","投标文件"]
elif type == 2:
target_values = ["开标", "评标", "定标"]
elif type == 3:
target_values = ["重新招标、不再招标和终止招标", "重新招标", "不再招标", "终止招标"]
else:
raise ValueError("Invalid type specified. Use 1 for '投标文件, 投标' or 2 for '开标, 评标, 定标'or 3 for '重新招标'")
with open(clause_path, 'r', encoding='utf-8') as file:
data = json.load(file)
extracted_data = extract_json(data, target_values) # 读取json
sorted_data=sort_clean_data_keys(extracted_data) #对键进行排序
transformed_data = transform_json(sorted_data)
return transformed_data
# 假设原始数据文件路径
if __name__ == "__main__":
file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp\\clause1.json'
# file_path='C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\clause1.json'
try:
res = extract_from_notice(file_path, 2) # 可以改变此处的 type 参数测试不同的场景
res2=json.dumps(res,ensure_ascii=False,indent=4)
print(res2)
except ValueError as e:
print(e)