9.30
This commit is contained in:
parent
99349a9968
commit
e017acc673
@ -18,40 +18,59 @@ def find_keys_with_prefix(key_prefix, json_data):
|
||||
|
||||
|
||||
# 从完整的json文件中读取所需数据,eg:投标、评标
|
||||
# def extract_json(data, target_values):
|
||||
# results = {}
|
||||
#
|
||||
# # 遍历所有目标值
|
||||
# for target_value in target_values:
|
||||
# # 找到所有与目标值匹配的键
|
||||
# matched_keys = find_keys_by_value(target_value, data)
|
||||
#
|
||||
# for key in matched_keys:
|
||||
# # 查找所有以该键为前缀的子键,限制只提取直接子项
|
||||
# key_and_subheadings = find_keys_with_prefix(key, data)
|
||||
#
|
||||
# for subkey in key_and_subheadings:
|
||||
# # 如果子键有多级结构(比如 '7.2.1'),并且是直接子项
|
||||
# if "." in subkey:
|
||||
# parent_key = subkey.rsplit('.', 1)[0]
|
||||
# top_level_key = parent_key.split('.')[0] + '.'
|
||||
#
|
||||
# # 确保顶级键不会重复添加
|
||||
# if top_level_key not in results:
|
||||
# results[top_level_key] = data[top_level_key]
|
||||
#
|
||||
# # 添加或更新父级键
|
||||
# if parent_key not in results:
|
||||
# if parent_key in data:
|
||||
# results[parent_key] = data[parent_key]
|
||||
#
|
||||
# # 添加当前子键和它的值
|
||||
# if subkey in data:
|
||||
# results[subkey] = data[subkey]
|
||||
#
|
||||
# return results
|
||||
def extract_json(data, target_values):
|
||||
results = {}
|
||||
|
||||
# 遍历所有目标值
|
||||
for target_value in target_values:
|
||||
# 找到所有与目标值匹配的键
|
||||
matched_keys = find_keys_by_value(target_value, data)
|
||||
|
||||
for key in matched_keys:
|
||||
# 查找所有以该键为前缀的子键,限制只提取直接子项
|
||||
key_and_subheadings = find_keys_with_prefix(key, data)
|
||||
|
||||
for subkey in key_and_subheadings:
|
||||
# 如果子键有多级结构(比如 '7.2.1'),并且是直接子项
|
||||
if "." in subkey:
|
||||
parent_key = subkey.rsplit('.', 1)[0]
|
||||
top_level_key = parent_key.split('.')[0] + '.'
|
||||
|
||||
# 确保顶级键不会重复添加
|
||||
# 特别处理定标相关的顶级键,确保不会重复添加其他键
|
||||
if top_level_key not in results:
|
||||
results[top_level_key] = data[top_level_key]
|
||||
|
||||
results[top_level_key] = target_value
|
||||
# 添加或更新父级键
|
||||
if parent_key not in results:
|
||||
if parent_key in data:
|
||||
results[parent_key] = data[parent_key]
|
||||
|
||||
# 添加当前子键和它的值
|
||||
if subkey in data:
|
||||
# 添加当前键
|
||||
results[subkey] = data[subkey]
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def sort_clean_data_keys(data):
|
||||
# 预处理:删除键名中的空格
|
||||
def preprocess_key(key):
|
||||
@ -243,12 +262,13 @@ def extract_from_notice(clause_path, type):
|
||||
return final_result
|
||||
|
||||
|
||||
# TODO: 再审视一下zbtest20的处理是否合理
|
||||
# TODO: extract_json新版本仍有问题,未知。
|
||||
if __name__ == "__main__":
|
||||
file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp\\clause1.json'
|
||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\3bffaa84-2434-4bd0-a8ee-5c234ccd7fa0\\clause1.json'
|
||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1\\clause9.json'
|
||||
file_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp\\clause1.json"
|
||||
try:
|
||||
res = extract_from_notice(file_path, 3) # 可以改变此处的 type 参数测试不同的场景
|
||||
res = extract_from_notice(file_path, 2) # 可以改变此处的 type 参数测试不同的场景
|
||||
res2 = json.dumps(res, ensure_ascii=False, indent=4)
|
||||
print(res2)
|
||||
except ValueError as e:
|
||||
|
@ -90,17 +90,30 @@ def parse_text_by_heading(text):
|
||||
lines = text.split('\n') #['一、说明', '1.适用范围', '招标文件仅适用于第一章“投标邀请书”中所述项目的货物、工程及服务的采购。']
|
||||
for i, line in enumerate(lines): #由于本身就是按行读取处理,因此保存的时候不带'\n'
|
||||
line_stripped = line.strip()
|
||||
print("yes")
|
||||
print(line_stripped)
|
||||
# 匹配中文数字标题,包括带括号和不带括号的情况
|
||||
chinese_match = re.match(r'^(?:\s*[((]?\s*([一二三四五六七八九十]+)\s*[))]?\s*[、]*)?\s*(.+)$', line_stripped)
|
||||
pattern_title = re.compile(r'^\s*(?:[((]\s*[一二三四五六七八九十]\s*[))]\s*|[一二三四五六七八九十]\s*、\s*)')
|
||||
|
||||
# 用于提取中文数字部分
|
||||
pattern_key = re.compile(r'[一二三四五六七八九十]')
|
||||
|
||||
# 用于提取标题后的内容部分
|
||||
pattern_value = re.compile(r'[^\s、))]+')
|
||||
|
||||
chinese_match = pattern_title.match(line_stripped) # 匹配是否为标题
|
||||
if chinese_match:
|
||||
chinese_key, chinese_value = chinese_match.groups()
|
||||
# 匹配到标题行,先提取中文数字部分
|
||||
chinese_key_match = pattern_key.search(line_stripped)
|
||||
# 提取中文数字后的内容部分
|
||||
chinese_value_match = pattern_value.search(
|
||||
line_stripped[chinese_key_match.end():]) if chinese_key_match else None
|
||||
if chinese_key_match and chinese_value_match:
|
||||
chinese_key = chinese_key_match.group() # 匹配到的中文数字,如 "一"
|
||||
chinese_value = chinese_value_match.group() # 匹配到的标题内容,如 "招标文件"
|
||||
# 将提取的标题和内容存储到字典中
|
||||
if chinese_key:
|
||||
chinese_key = f"{chinese_key}" # 统一格式为"数字、"
|
||||
data[chinese_key] = chinese_value
|
||||
current_key = None
|
||||
current_content = []
|
||||
current_key = None # 重置当前key
|
||||
current_content = [] # 清空当前内容
|
||||
continue
|
||||
|
||||
# 新增:匹配阿拉伯数字开头后跟顿号的情况,如 "7、竞争性磋商采购文件的修改"
|
||||
@ -210,7 +223,7 @@ def process_folder(input_folder, output_folder):
|
||||
|
||||
if __name__ == "__main__":
|
||||
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
|
||||
file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\ztbfile_tobidders_notice_part2.pdf'
|
||||
file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
|
||||
output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1'
|
||||
try:
|
||||
output_path = convert_clause_to_json(file_path,output_folder)
|
||||
|
Loading…
x
Reference in New Issue
Block a user