This commit is contained in:
zy123 2024-09-30 17:52:59 +08:00
parent 99349a9968
commit e017acc673
2 changed files with 64 additions and 31 deletions

View File

@ -18,40 +18,59 @@ def find_keys_with_prefix(key_prefix, json_data):
# 从完整的json文件中读取所需数据eg:投标、评标 # 从完整的json文件中读取所需数据eg:投标、评标
# def extract_json(data, target_values):
# results = {}
#
# # 遍历所有目标值
# for target_value in target_values:
# # 找到所有与目标值匹配的键
# matched_keys = find_keys_by_value(target_value, data)
#
# for key in matched_keys:
# # 查找所有以该键为前缀的子键,限制只提取直接子项
# key_and_subheadings = find_keys_with_prefix(key, data)
#
# for subkey in key_and_subheadings:
# # 如果子键有多级结构(比如 '7.2.1'),并且是直接子项
# if "." in subkey:
# parent_key = subkey.rsplit('.', 1)[0]
# top_level_key = parent_key.split('.')[0] + '.'
#
# # 确保顶级键不会重复添加
# if top_level_key not in results:
# results[top_level_key] = data[top_level_key]
#
# # 添加或更新父级键
# if parent_key not in results:
# if parent_key in data:
# results[parent_key] = data[parent_key]
#
# # 添加当前子键和它的值
# if subkey in data:
# results[subkey] = data[subkey]
#
# return results
def extract_json(data, target_values):
    """Collect the entries of ``data`` that belong to the requested headings.

    For each value in ``target_values`` the keys returned by
    ``find_keys_by_value(target_value, data)`` are expanded with
    ``find_keys_with_prefix(key, data)``, and every resulting sub-key is
    copied into the result together with its parent and top-level entries.

    Args:
        data: Mapping of heading keys to text. Keys are presumably dotted
            section numbers such as ``'7.2.1'`` -- confirm against the caller.
        target_values: Iterable of values to look up in ``data``.

    Returns:
        dict: The selected subset of ``data``. A top-level key (the first
        numeric component followed by ``'.'``) maps to the ``target_value``
        that first introduced it; parent keys and sub-keys map to their
        values in ``data``.

    Raises:
        KeyError: If ``find_keys_with_prefix`` yields a key that is not
            present in ``data``.
    """
    results = {}
    for target_value in target_values:
        # Every key whose value matches the requested target.
        for key in find_keys_by_value(target_value, data):
            # The matched key plus all keys that share it as a prefix.
            for subkey in find_keys_with_prefix(key, data):
                if "." in subkey:
                    parent_key = subkey.rsplit('.', 1)[0]
                    top_level_key = parent_key.split('.')[0] + '.'
                    # Record the top-level heading once, labelled with the
                    # target value that introduced it, so later matches do
                    # not overwrite it.
                    if top_level_key not in results:
                        results[top_level_key] = target_value
                    # Add the parent entry only when it actually exists.
                    if parent_key not in results and parent_key in data:
                        results[parent_key] = data[parent_key]
                # Always add the current key itself.
                # NOTE(review): assumes find_keys_with_prefix only returns
                # keys taken from ``data`` -- confirm.
                results[subkey] = data[subkey]
    return results
def sort_clean_data_keys(data): def sort_clean_data_keys(data):
# 预处理:删除键名中的空格 # 预处理:删除键名中的空格
def preprocess_key(key): def preprocess_key(key):
@ -243,12 +262,13 @@ def extract_from_notice(clause_path, type):
return final_result return final_result
# TODO: 再审视一下zbtest20的处理是否合理 # TODO: extract_json新版本仍有问题未知。
if __name__ == "__main__": if __name__ == "__main__":
file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp\\clause1.json' # file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\3bffaa84-2434-4bd0-a8ee-5c234ccd7fa0\\clause1.json'
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1\\clause9.json' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1\\clause9.json'
file_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp\\clause1.json"
try: try:
res = extract_from_notice(file_path, 3) # 可以改变此处的 type 参数测试不同的场景 res = extract_from_notice(file_path, 2) # 可以改变此处的 type 参数测试不同的场景
res2 = json.dumps(res, ensure_ascii=False, indent=4) res2 = json.dumps(res, ensure_ascii=False, indent=4)
print(res2) print(res2)
except ValueError as e: except ValueError as e:

View File

@ -90,18 +90,31 @@ def parse_text_by_heading(text):
lines = text.split('\n') #['一、说明', '1.适用范围', '招标文件仅适用于第一章“投标邀请书”中所述项目的货物、工程及服务的采购。'] lines = text.split('\n') #['一、说明', '1.适用范围', '招标文件仅适用于第一章“投标邀请书”中所述项目的货物、工程及服务的采购。']
for i, line in enumerate(lines): #由于本身就是按行读取处理,因此保存的时候不带'\n' for i, line in enumerate(lines): #由于本身就是按行读取处理,因此保存的时候不带'\n'
line_stripped = line.strip() line_stripped = line.strip()
print("yes")
print(line_stripped)
# 匹配中文数字标题,包括带括号和不带括号的情况 # 匹配中文数字标题,包括带括号和不带括号的情况
chinese_match = re.match(r'^(?:\s*[(]?\s*([一二三四五六七八九十]+)\s*[)]?\s*[、]*)?\s*(.+)$', line_stripped) pattern_title = re.compile(r'^\s*(?:[(]\s*[一二三四五六七八九十]\s*[)]\s*|[一二三四五六七八九十]\s*、\s*)')
# 用于提取中文数字部分
pattern_key = re.compile(r'[一二三四五六七八九十]')
# 用于提取标题后的内容部分
pattern_value = re.compile(r'[^\s、)]+')
chinese_match = pattern_title.match(line_stripped) # 匹配是否为标题
if chinese_match: if chinese_match:
chinese_key, chinese_value = chinese_match.groups() # 匹配到标题行,先提取中文数字部分
if chinese_key: chinese_key_match = pattern_key.search(line_stripped)
chinese_key = f"{chinese_key}" # 统一格式为"数字、" # 提取中文数字后的内容部分
data[chinese_key] = chinese_value chinese_value_match = pattern_value.search(
current_key = None line_stripped[chinese_key_match.end():]) if chinese_key_match else None
current_content = [] if chinese_key_match and chinese_value_match:
continue chinese_key = chinese_key_match.group() # 匹配到的中文数字,如 "一"
chinese_value = chinese_value_match.group() # 匹配到的标题内容,如 "招标文件"
# 将提取的标题和内容存储到字典中
if chinese_key:
data[chinese_key] = chinese_value
current_key = None # 重置当前key
current_content = [] # 清空当前内容
continue
# 新增:匹配阿拉伯数字开头后跟顿号的情况,如 "7、竞争性磋商采购文件的修改" # 新增:匹配阿拉伯数字开头后跟顿号的情况,如 "7、竞争性磋商采购文件的修改"
arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped) arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
@ -210,7 +223,7 @@ def process_folder(input_folder, output_folder):
if __name__ == "__main__": if __name__ == "__main__":
# file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf' # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\ztbfile_tobidders_notice_part2.pdf' file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目_tobidders_notice_part2.pdf'
output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1' output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1'
try: try:
output_path = convert_clause_to_json(file_path,output_folder) output_path = convert_clause_to_json(file_path,output_folder)