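# NOTE: the lines originally here ("176 lines", "8.3 KiB", "Python",
# "Raw Normal View History", and a "2024-10-09 13:50:28 +08:00" timestamp)
# are repository web-viewer residue, not Python source. They are kept only
# as this comment.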
import json
import re
2024-09-30 16:23:39 +08:00
2024-10-09 13:50:28 +08:00
def compare_headings(current, new):
    """Return True when heading number ``new`` comes after ``current``.

    Both arguments are dotted section numbers such as ``'1.2'`` or ``'3.'``.
    Only the all-digit components are kept. They are compared left to right,
    and when one sequence is a prefix of the other the longer one counts as
    the later heading (``'1.1.1'`` follows ``'1.1'``). Equal numbers yield
    False.
    """
    def numeric_parts(label):
        # Drop empty or non-numeric pieces, e.g. the '' after a trailing dot.
        return [int(piece) for piece in label.split('.') if piece.isdigit()]

    current_parts = numeric_parts(current)
    new_parts = numeric_parts(new)
    # List comparison is lexicographic and a strict prefix sorts before the
    # longer list, which is exactly the ordering described above.
    return new_parts > current_parts
def should_add_newline(content, keywords, max_length=20):
    """Decide whether a line break should follow the text gathered so far.

    Args:
        content: iterable of string fragments; they are concatenated and
            stripped before being inspected.
        keywords: substrings that each force a True result when present.
        max_length: texts of at most this many characters also yield True.

    Returns:
        True if the joined text contains any keyword or is no longer than
        ``max_length``; otherwise False.
    """
    joined = ''.join(content).strip()
    for word in keywords:
        if word in joined:
            return True
    return len(joined) <= max_length
def handle_content_append(current_content, line_content, append_newline, keywords):
    """Append ``line_content`` to ``current_content``, optionally after a break.

    When ``append_newline`` is truthy, a ``'\\n'`` element is inserted first,
    provided ``should_add_newline`` approves the text collected so far. The
    pending flag is consumed either way.

    Args:
        current_content: list of string fragments; mutated in place.
        line_content: the fragment to append.
        append_newline: whether a line break is pending before this fragment.
        keywords: passed through to ``should_add_newline``.

    Returns:
        The updated pending flag: the original value when it was falsy,
        otherwise False.
    """
    if not append_newline:
        current_content.append(line_content)
        return append_newline

    if should_add_newline(current_content, keywords):
        current_content.append('\n')
    current_content.append(line_content)
    return False
# def parse_text_by_heading(text):
# keywords = ['包含', '以下']
# data = {}
# current_key = None
# current_content = []
# append_newline = False
# skip_subheadings = False
#
# lines = text.split('\n')
# for i, line in enumerate(lines):
# line_stripped = line.strip().replace('', '.')
# # print(line_stripped)
# # 匹配二级、三级标题形如 '1.1'、'2.2.3' 并确保其前后没有字母或括号
# match = re.match(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
# if not match:
# #匹配一级标题'1.'
# match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
#
# if match:
# new_key, line_content = match.groups() #new_key:1.1 line_content:本次竞争性磋商文件仅适用于本次竞争性磋商公告中所叙述的广水农商行门禁控制主机及基础验证设备采购项
# line_content = line_content.lstrip('..、,') #删除左边的若干个..、,
#
# # 检查是否包含关键词,决定是否跳过子标题
# if any(keyword in line_content for keyword in keywords):
# skip_subheadings = True
# else:
# skip_subheadings = False
#
# # 检查是否应该更新当前键和内容
# if current_key is None or (compare_headings(current_key, new_key) and (
# len(current_content) == 0 or current_content[-1][-1] != '第')):
# if current_key is not None:
# content_string = ''.join(current_content).strip()
# data[current_key] = content_string.replace(' ', '')
# current_key = new_key
# current_content = [line_content]
# append_newline = len(new_key.rstrip('.').split('.')) <= 2
# else:
# append_newline = handle_content_append(current_content, line_content, append_newline, keywords)
# else:
# if not skip_subheadings:
# # 匹配中文数字标题,包括带括号和不带括号的情况
# pattern_title = re.compile(
# r'^\s*(?:[(]\s*([一二三四五六七八九十]{1,2})\s*[)]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)')
# chinese_match = pattern_title.match(line_stripped)
#
# # 匹配字母标题,如 B.磋商说明
# letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped)
#
# # 匹配阿拉伯数字开头后跟顿号的情况,如 "7、竞争性磋商采购文件的修改"
# arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
#
# if chinese_match or letter_match or arabic_match:
# if current_key is not None:
# content_string = ''.join(current_content).strip()
# data[current_key] = content_string.replace(' ', '')
# if chinese_match:
# pattern_key = re.compile(r'[一二三四五六七八九十]{1,2}')
# pattern_value = re.compile(r'[^\s、)]+')
# chinese_key_match = pattern_key.search(line_stripped)
# chinese_value_match = pattern_value.search(
# line_stripped[chinese_key_match.end():]) if chinese_key_match else None
# if chinese_key_match and chinese_value_match:
# current_key = chinese_key_match.group()
# current_content = [chinese_value_match.group()]
# elif letter_match:
# letter_key, letter_value = letter_match.groups()
# # 将字母转换为对应的中文数字
# letter_to_chinese = {
# 'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五',
# 'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十',
# 'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
# }
# current_key = letter_to_chinese.get(letter_key, letter_key)
# current_content = [letter_value]
# elif arabic_match:
# arabic_key, arabic_value = arabic_match.groups()
# current_key = arabic_key.replace('、', '.')
# current_content = [arabic_value]
#
# append_newline = True
# continue
#
# if line_stripped:
# append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
#
# if current_key is not None:
# content_string = ''.join(current_content).strip()
# data[current_key] = content_string.replace(' ', '')
#
# return data
def parse_text_by_heading(text):
    """Split numbered-clause text into a ``{heading number: content}`` dict.

    The text is processed line by line. A line that starts with a dotted
    number (``'1.1'``, ``'2.2.3'``) or a single number followed by a dot
    (``'1.'``) opens a new entry when it is the first heading seen, or when
    its number is later than the current heading (see ``compare_headings``)
    and the text collected so far does not end with ``'第'``. In that last
    case the number is a cross-reference continuing the sentence, e.g.
    ``'...不得修改第'`` followed by ``'23.1 条的内容'``. Every other non-empty
    line is appended to the current entry via ``handle_content_append``.

    Args:
        text: the raw text, with lines separated by ``'\\n'``.

    Returns:
        A dict mapping each accepted heading number (as written, e.g.
        ``'1.1'`` or ``'3.'``) to its content with ASCII spaces removed.
        Lines seen before the first heading are discarded.
    """
    keywords = ['包含', '以下']
    # U+FF0E FULLWIDTH FULL STOP is normalised to an ASCII dot so headings
    # written as "1．1" are recognised. (Replacing the empty string instead
    # would insert a dot between every character and break all matching.)
    # NOTE(review): the character was lost in the listing; U+FF0E is the
    # presumed original - confirm against the repository.
    fullwidth_dot = '\uff0e'
    # Dotted headings with at least one inner dot: '1.1', '2.2.3', ...
    multi_level_heading = re.compile(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)')
    # Top-level headings: '1.' followed by some text.
    top_level_heading = re.compile(r'^(\d+\.)\s*(.+)$')

    def store(key, parts):
        # Join the fragments, trim the ends and drop ASCII spaces.
        data[key] = ''.join(parts).strip().replace(' ', '')

    data = {}
    current_key = None
    current_content = []
    append_newline = False

    for line in text.split('\n'):
        line_stripped = line.strip().replace(fullwidth_dot, '.')
        match = (multi_level_heading.match(line_stripped)
                 or top_level_heading.match(line_stripped))

        if not match:
            if line_stripped:
                append_newline = handle_content_append(
                    current_content, line_stripped, append_newline, keywords)
            continue

        new_key, line_content = match.groups()
        line_content = line_content.lstrip('.')

        # The previous fragment ending in '第' means this number completes a
        # reference such as "第 23.1 条". endswith() is also safe when the
        # previous fragment is empty (a heading line with no text), where
        # indexing [-1][-1] would raise IndexError.
        continues_reference = (bool(current_content)
                               and current_content[-1].endswith('第'))

        if current_key is None or (compare_headings(current_key, new_key)
                                   and not continues_reference):
            if current_key is not None:
                store(current_key, current_content)
            current_key = new_key
            current_content = [line_content]
            # Only headings of at most two levels ('1.', '1.1') may be
            # followed by a line break before their continuation text.
            append_newline = len(new_key.rstrip('.').split('.')) <= 2
        else:
            # Not accepted as a heading: keep the whole line as content.
            append_newline = handle_content_append(
                current_content, line_stripped, append_newline, keywords)

    if current_key is not None:
        # Save the final entry.
        store(current_key, current_content)
    return data
2024-10-09 13:50:28 +08:00
# Sample text: the first line ends with "第" and the next line starts with
# "23.1", presumably to exercise the case where a leading number is a
# cross-reference continuing the sentence rather than a new heading.
# (A stray blame timestamp that had leaked into the sample string was removed.)
text = """
22.3 从提交响应文件截止时间至磋商有效期期满这段时间供应商不得修改第
23.1 条的内容
"""
res = parse_text_by_heading(text)
print(json.dumps(res, ensure_ascii=False, indent=4))