1.22 限制了招标公示网址的长度

This commit is contained in:
zy123 2025-01-23 10:16:56 +08:00
parent 702b8a39ef
commit e4279e9f2d
2 changed files with 19 additions and 6 deletions

View File

@ -40,16 +40,17 @@ def compare_headings(current, new):
def should_add_newline(content, keywords, max_length=20): def should_add_newline(content, keywords, max_length=20):
#如果拼接后的字符串 content_str 中包含 keywords 中的任意一个关键词,或者 content_str 的长度小于或等于 max_length,则返回 True,表示应该添加换行符。
content_str = ''.join(content).strip() content_str = ''.join(content).strip()
return any(keyword in content_str for keyword in keywords) or len(content_str) <= max_length return any(keyword in content_str for keyword in keywords) or len(content_str) <= max_length
def handle_content_append(current_content, line_content, append_newline, keywords, in_special_section): def handle_content_append(current_content, line_content, append_newline, keywords, in_special_section):
if append_newline: if append_newline:
if should_add_newline(current_content, keywords): if should_add_newline(current_content, keywords): #current_content:['费用承担'] line_content:'投标人准备和参加投标活动发生的费用自理。'
current_content.append('\n') # 添加换行符 current_content.append('\n') # 添加换行符 #current_content:['费用承担', '\n']
append_newline = False append_newline = False
current_content.append(line_content) current_content.append(line_content) #current_content:['费用承担', '\n', '投标人准备和参加投标活动发生的费用自理。']
if in_special_section: if in_special_section:
current_content.append('\n') current_content.append('\n')
return append_newline return append_newline
@ -75,7 +76,7 @@ def parse_text_by_heading(text):
initial_heading_pattern = None initial_heading_pattern = None
special_section_keywords = ['文件的组成', '文件的构成', '文件包括:', '文件包括:', '雷同认定', special_section_keywords = ['文件的组成', '文件的构成', '文件包括:', '文件包括:', '雷同认定',
'包括以下','包括下列','情形之一','情况之一'] # 定义特殊章节关键词 '以下','情形之一','情况之一','下列'] # 定义特殊章节关键词
in_special_section = False # 标志是否在特殊章节中 in_special_section = False # 标志是否在特殊章节中
lines = text.split('\n') lines = text.split('\n')
@ -120,7 +121,7 @@ def parse_text_by_heading(text):
break break
for i, line in enumerate(lines): for i, line in enumerate(lines):
in_double_hash_mode = False in_double_hash_mode = False #标志是否位于页码第一行
line_stripped = line.strip().replace('.', '.') line_stripped = line.strip().replace('.', '.')
if line_stripped.startswith("##"): if line_stripped.startswith("##"):
in_double_hash_mode = True in_double_hash_mode = True
@ -182,7 +183,7 @@ def parse_text_by_heading(text):
data[current_key] = data.get(current_key, '') + content_string data[current_key] = data.get(current_key, '') + content_string
current_key = new_key current_key = new_key
current_content = [line_content] current_content = [line_content]
append_newline = len(new_key.rstrip('.').split('.')) <= 2 append_newline = len(new_key.rstrip('.').split('.')) <= 2 or any(keyword in line_content for keyword in special_section_keywords)
last_main_number = new_key.split('.')[0] last_main_number = new_key.split('.')[0]
else: else:
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
@ -193,6 +194,11 @@ def parse_text_by_heading(text):
# 处理以点号开头并带有数字的情况 # 处理以点号开头并带有数字的情况
sub_number, line_content = dot_match.groups() sub_number, line_content = dot_match.groups()
sub_number = sub_number.replace('.', '.') sub_number = sub_number.replace('.', '.')
# 检查是否进入或退出特殊章节
if any(keyword in line_content for keyword in special_section_keywords):
in_special_section = True
elif in_special_section:
in_special_section = False
if last_main_number: if last_main_number:
new_key = f"{last_main_number}.{sub_number}" new_key = f"{last_main_number}.{sub_number}"
if current_key is not None: if current_key is not None:
@ -211,6 +217,11 @@ def parse_text_by_heading(text):
if in_double_hash_mode: if in_double_hash_mode:
# 处理以点号开头但不带数字的情况,自动生成下一个一级序号 # 处理以点号开头但不带数字的情况,自动生成下一个一级序号
temp_content = dot_text_match.group(1).strip() temp_content = dot_text_match.group(1).strip()
# 检查是否进入或退出特殊章节
if any(keyword in temp_content for keyword in special_section_keywords):
in_special_section = True
elif in_special_section:
in_special_section = False
if last_main_number: if last_main_number:
# 生成下一个一级序号 # 生成下一个一级序号
try: try:

View File

@ -269,6 +269,8 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
# 货物标和工程标的资格审查整合 # 货物标和工程标的资格审查整合
##TODO:陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目(1)_tobidders_notice_part2.pdf 唐山市公安交通警察支队机动车查验机构视频存储回放系统竞争性谈判-招标文件正文(1)_tobidders_notice_part1.pdf 不好搞 ##TODO:陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目(1)_tobidders_notice_part2.pdf 唐山市公安交通警察支队机动车查验机构视频存储回放系统竞争性谈判-招标文件正文(1)_tobidders_notice_part1.pdf 不好搞
# 无法判断用户上传的是否为乱码文件,可以考虑并行调用大模型,如果为乱码文件直接return None # 无法判断用户上传的是否为乱码文件,可以考虑并行调用大模型,如果为乱码文件直接return None
#截取json文件有些问题C:\Users\Administrator\Desktop\新建文件夹 (3)\test keywords和special...
if __name__ == "__main__": if __name__ == "__main__":
# 配置日志器 # 配置日志器
unique_id = "uuidzyzy11" unique_id = "uuidzyzy11"