262 lines
13 KiB
Python
Raw Normal View History

2024-10-09 13:50:28 +08:00
import json
import re
2024-09-30 16:23:39 +08:00
2024-10-09 13:50:28 +08:00
def compare_headings(current: str, new: str) -> bool:
    """Return True when heading number ``new`` sorts after ``current``.

    Both arguments are dotted heading numbers such as ``'3.'`` or ``'3.2.1'``.
    Each is split on ``'.'`` and only the purely numeric components are kept,
    so a trailing dot (which yields an empty component) is ignored.

    Args:
        current: Number of the heading that is currently open.
        new: Number of the candidate heading.

    Returns:
        True if the first differing component of ``new`` is larger than the
        one in ``current``, or if all shared components are equal and ``new``
        has more components (e.g. ``'3.'`` -> ``'3.1'``). False otherwise,
        including when both numbers are equal.
    """
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscript digits, which int() cannot convert.
    current_nums = [int(part) for part in current.split('.') if part.isdecimal()]
    new_nums = [int(part) for part in new.split('.') if part.isdecimal()]
    # List comparison is lexicographic: the first differing component decides,
    # and when one list is a prefix of the other the longer list is greater.
    return new_nums > current_nums
def should_add_newline(content, keywords, max_length: int = 20) -> bool:
    """Tell whether a line break should be inserted after ``content``.

    Args:
        content: Iterable of string fragments; they are concatenated and
            stripped of surrounding whitespace before being examined.
        keywords: Substrings whose presence in the joined text forces a
            line break.
        max_length: Joined texts of at most this many characters also get a
            line break.

    Returns:
        True if the joined text contains any keyword or is no longer than
        ``max_length`` characters (an empty text therefore returns True).
    """
    content_str = ''.join(content).strip()
    if len(content_str) <= max_length:
        return True
    return any(keyword in content_str for keyword in keywords)
def handle_content_append(current_content, line_content, append_newline, keywords):
    """Append ``line_content`` to ``current_content``, optionally after a newline.

    Args:
        current_content: List of string fragments collected so far; it is
            modified in place.
        line_content: The fragment to append.
        append_newline: Whether a line break should be considered before
            this fragment.
        keywords: Passed through to ``should_add_newline``.

    Returns:
        The updated ``append_newline`` flag: always False when the flag was
        set on entry, otherwise the value that was passed in.
    """
    if append_newline:
        if should_add_newline(current_content, keywords):
            current_content.append('\n')
        # NOTE(review): the flag is treated as one-shot, i.e. cleared whether
        # or not a newline was actually inserted. The source this was
        # recovered from had lost its indentation, so confirm this nesting.
        append_newline = False
    current_content.append(line_content)
    return append_newline
# def parse_text_by_heading(text):
# keywords = ['包含', '以下']
# data = {}
# current_key = None
# current_content = []
# append_newline = False
# skip_subheadings = False
#
# lines = text.split('\n')
# for i, line in enumerate(lines):
# line_stripped = line.strip().replace('', '.')
# # print(line_stripped)
# # 匹配二级、三级标题形如 '1.1'、'2.2.3' 并确保其前后没有字母或括号
# match = re.match(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)', line_stripped)
# if not match:
# #匹配一级标题'1.'
# match = re.match(r'^(\d+\.)\s*(.+)$', line_stripped)
#
# if match:
# new_key, line_content = match.groups() #new_key:1.1 line_content:本次竞争性磋商文件仅适用于本次竞争性磋商公告中所叙述的广水农商行门禁控制主机及基础验证设备采购项
# line_content = line_content.lstrip('..、,') #删除左边的若干个..、,
#
# # 检查是否包含关键词,决定是否跳过子标题
# if any(keyword in line_content for keyword in keywords):
# skip_subheadings = True
# else:
# skip_subheadings = False
#
# # 检查是否应该更新当前键和内容
# if current_key is None or (compare_headings(current_key, new_key) and (
# len(current_content) == 0 or current_content[-1][-1] != '第')):
# if current_key is not None:
# content_string = ''.join(current_content).strip()
# data[current_key] = content_string.replace(' ', '')
# current_key = new_key
# current_content = [line_content]
# append_newline = len(new_key.rstrip('.').split('.')) <= 2
# else:
# append_newline = handle_content_append(current_content, line_content, append_newline, keywords)
# else:
# if not skip_subheadings:
# # 匹配中文数字标题,包括带括号和不带括号的情况
# pattern_title = re.compile(
# r'^\s*(?:[(]\s*([一二三四五六七八九十]{1,2})\s*[)]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)')
# chinese_match = pattern_title.match(line_stripped)
#
# # 匹配字母标题,如 B.磋商说明
# letter_match = re.match(r'^([A-Z])\.\s*(.*)$', line_stripped)
#
# # 匹配阿拉伯数字开头后跟顿号的情况,如 "7、竞争性磋商采购文件的修改"
# arabic_match = re.match(r'^(\d+、)\s*(.+)$', line_stripped)
#
# if chinese_match or letter_match or arabic_match:
# if current_key is not None:
# content_string = ''.join(current_content).strip()
# data[current_key] = content_string.replace(' ', '')
# if chinese_match:
# pattern_key = re.compile(r'[一二三四五六七八九十]{1,2}')
# pattern_value = re.compile(r'[^\s、)]+')
# chinese_key_match = pattern_key.search(line_stripped)
# chinese_value_match = pattern_value.search(
# line_stripped[chinese_key_match.end():]) if chinese_key_match else None
# if chinese_key_match and chinese_value_match:
# current_key = chinese_key_match.group()
# current_content = [chinese_value_match.group()]
# elif letter_match:
# letter_key, letter_value = letter_match.groups()
# # 将字母转换为对应的中文数字
# letter_to_chinese = {
# 'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五',
# 'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十',
# 'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五'
# }
# current_key = letter_to_chinese.get(letter_key, letter_key)
# current_content = [letter_value]
# elif arabic_match:
# arabic_key, arabic_value = arabic_match.groups()
# current_key = arabic_key.replace('、', '.')
# current_content = [arabic_value]
#
# append_newline = True
# continue
#
# if line_stripped:
# append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)
#
# if current_key is not None:
# content_string = ''.join(current_content).strip()
# data[current_key] = content_string.replace(' ', '')
#
# return data
# Heading patterns, compiled once instead of on every input line.
# Dotted multi-level number at the start of the line, e.g. '1.1' or '2.2.3'.
_MULTI_LEVEL_HEADING = re.compile(r'^(?<![a-zA-Z(])(\d+(?:\.\d+)+)\s*(.*)')
# Single-level number followed by a dot and some text, e.g. '19.xxx'.
_TOP_LEVEL_HEADING = re.compile(r'^(\d+\.)\s*(.+)$')
# Chinese-numeral heading, either parenthesised or followed by '、'.
_CHINESE_HEADING = re.compile(
    r'^\s*(?:[(]\s*([一二三四五六七八九十]{1,2})\s*[)]\s*|([一二三四五六七八九十]{1,2})\s*、\s*)')
_CHINESE_NUMERAL = re.compile(r'[一二三四五六七八九十]{1,2}')
_CHINESE_TITLE = re.compile(r'[^\s、)]+')
# Capital-letter heading, e.g. 'B.xxx'.
_LETTER_HEADING = re.compile(r'^([A-Z])\.\s*(.*)$')
# Arabic number followed by '、', e.g. '7、xxx'.
_ARABIC_COMMA_HEADING = re.compile(r'^(\d+、)\s*(.+)$')

# Letter headings are stored under the Chinese numeral of the same rank.
_LETTER_TO_CHINESE = {
    'A': '一', 'B': '二', 'C': '三', 'D': '四', 'E': '五',
    'F': '六', 'G': '七', 'H': '八', 'I': '九', 'J': '十',
    'K': '十一', 'L': '十二', 'M': '十三', 'N': '十四', 'O': '十五',
}


def parse_text_by_heading(text):
    """Split numbered document text into a ``{heading: content}`` dict.

    The text is processed line by line:

    * A line starting with a dotted number (``'1.1'``, ``'2.2.3'``) or with
      ``'N.'`` followed by text opens a new section keyed by that number,
      provided the number sorts after the current key (``compare_headings``)
      and the previously collected fragment does not end with ``'第'``.
      Otherwise the line's text is appended to the current section.
    * A line starting with a Chinese numeral heading (``'一、'`` or a
      parenthesised numeral), a capital letter plus ``'.'``, or ``'N、'`` is
      treated as a sub-heading, unless the most recent numbered heading
      contained one of the keywords ``'包含'`` / ``'以下'``.
    * Any other non-empty line is appended to the current section.

    All ASCII spaces are removed from stored content. Text that precedes the
    first numbered heading is discarded.

    Args:
        text: The raw text, with lines separated by ``'\\n'``.

    Returns:
        A dict mapping heading keys (e.g. ``'19.'``, ``'19.1'``, ``'一'``) to
        the content collected for them.
    """
    keywords = ['包含', '以下']
    data = {}
    current_key = None
    current_key_chinese = None
    current_content = []
    append_newline = False
    skip_subheadings = False

    for line in text.split('\n'):
        # Normalise the full-width period so the numeric patterns can match.
        # NOTE(review): the search character was missing from the recovered
        # source (the call read replace('', '.'), which inserts a dot between
        # every character and makes every pattern fail). The full-width period
        # is assumed because it is also stripped from heading text below.
        line_stripped = line.strip().replace('.', '.')

        match = (_MULTI_LEVEL_HEADING.match(line_stripped)
                 or _TOP_LEVEL_HEADING.match(line_stripped))
        if match:
            new_key, line_content = match.groups()
            line_content = line_content.lstrip('..、,')
            # A heading that announces an enumeration keeps its sub-headings
            # as plain content of the same section.
            skip_subheadings = any(keyword in line_content for keyword in keywords)

            # A numbered heading closes any open Chinese/letter section.
            if current_key_chinese is not None:
                content_string = ''.join(current_content).strip()
                data[current_key_chinese] = content_string.replace(' ', '')
                current_key_chinese = None

            # Presumably a fragment ending in '第' means the number belongs to
            # a wrapped cross-reference, not to a heading. endswith() is used
            # so an empty fragment cannot raise IndexError.
            starts_new_section = current_key is None or (
                compare_headings(current_key, new_key)
                and not (current_content and current_content[-1].endswith('第'))
            )
            if starts_new_section:
                if current_key is not None:
                    content_string = ''.join(current_content).strip().replace(' ', '')
                    # A key that occurs more than once accumulates its content.
                    data[current_key] = data.get(current_key, '') + content_string
                current_key = new_key
                current_content = [line_content]
                # Only first- and second-level headings request a line break
                # after their title.
                append_newline = len(new_key.rstrip('.').split('.')) <= 2
            else:
                append_newline = handle_content_append(
                    current_content, line_content, append_newline, keywords)
            continue

        if not skip_subheadings:
            chinese_match = _CHINESE_HEADING.match(line_stripped)
            letter_match = _LETTER_HEADING.match(line_stripped)
            arabic_match = _ARABIC_COMMA_HEADING.match(line_stripped)
            if chinese_match or letter_match or arabic_match:
                # NOTE(review): the content collected so far is replaced below
                # without being written to ``data`` first; the flush that used
                # to live here was disabled with a TODO. Behaviour kept as-is;
                # confirm whether dropping that content is intended.
                if chinese_match:
                    key_match = _CHINESE_NUMERAL.search(line_stripped)
                    value_match = (
                        _CHINESE_TITLE.search(line_stripped[key_match.end():])
                        if key_match else None
                    )
                    # A numeral without a title leaves the state untouched.
                    if key_match and value_match:
                        current_key_chinese = key_match.group()
                        current_content = [value_match.group()]
                elif letter_match:
                    letter_key, letter_value = letter_match.groups()
                    # Letters past 'O' are used unchanged as the key.
                    current_key_chinese = _LETTER_TO_CHINESE.get(letter_key, letter_key)
                    current_content = [letter_value]
                else:
                    arabic_key, arabic_value = arabic_match.groups()
                    # '7、' is stored under the same key format as '7.'.
                    current_key = arabic_key.replace('、', '.')
                    current_content = [arabic_value]
                append_newline = True
                continue

        if line_stripped:
            append_newline = handle_content_append(
                current_content, line_stripped, append_newline, keywords)

    # Store whatever is still pending under the last numeric key. Unlike the
    # in-loop flush, this assignment overwrites an existing entry.
    # NOTE(review): a Chinese/letter section that is still open at the end of
    # the text is not stored under its own key (that flush was disabled in the
    # original); confirm whether this is intended.
    if current_key is not None:
        content_string = ''.join(current_content).strip()
        data[current_key] = content_string.replace(' ', '')
    return data
# Test text: sample input used as a manual smoke test for parse_text_by_heading.
text = """
在这种
哈哈哈
红红火火恍恍惚惚
竞争性磋商响应文件的递交
19.竞争性磋商响应文件的加密
19.1 供应商应当按照要求制作电子磋商响应文件并在投标时上传加密的电子磋
商响应文件未加密的电子磋商响应文件采购人电子交易平台将拒收
并提示
22.3 从提交响应文件截止时间至磋商有效期期满这段时间供应商不得修改或
撤销其响应文件供应商所提交的响应文件在磋商结束后无论成交与否都不退
竞争性磋商开标时间和地点
供应商在规定的磋商截止时间开标时间电子交易平台上公开进行
开标所有供应商均应当准时在线参加开标
采购人采购代理机构通过互联网在磋商须知前附表规定的地点组织开标
并在投标截止时间 30 分钟前使用 CA 数字证书登录电子交易平台进入
开标大厅选择相应标段作在线开标的准备工作
供应商应当在能够保证设施设备可靠互联网畅通的任意地点通过互联网
在线参加开标在投标截止时间前使用加密其投标文件的 CA 数字证书登录
子交易平台进入开标大厅选择所投标段进行签到并实时在线关注招标
人的操作情况
:开标结束后请各投标人不要长时间离开电脑随时关注业务系统
待专家资格性审查和符合性审查完成后要求的最后报价若在规定时间内投标人
没有按要求进行多轮报价则作自动弃权处理
磋商程序及步骤
23.竞争性磋商小组
"""
# NOTE: these statements sit at module level, so they also run whenever this
# file is imported, not only when it is executed as a script.
res = parse_text_by_heading(text)
# ensure_ascii=False prints the Chinese text as-is instead of \uXXXX escapes.
print(json.dumps(res, ensure_ascii=False, indent=4))