1.22 限制了招标公示网址的长度
This commit is contained in:
parent
702b8a39ef
commit
e4279e9f2d
@ -40,16 +40,17 @@ def compare_headings(current, new):
|
||||
|
||||
|
||||
def should_add_newline(content, keywords, max_length=20):
    """Decide whether a line break should follow the content collected so far.

    The pieces in ``content`` are concatenated and stripped of surrounding
    whitespace. A line break is wanted when the resulting text either contains
    one of ``keywords`` or is short enough to be treated as a heading-like
    fragment.

    Args:
        content: Iterable of string fragments collected so far.
        keywords: Iterable of substrings; any hit forces a line break.
        max_length: Texts of at most this many characters also get a
            line break.

    Returns:
        True if a newline should be appended, False otherwise.
    """
    content_str = ''.join(content).strip()
    # Length check first: it is cheaper than the keyword scan and the result
    # of the `or` is the same regardless of order.
    if len(content_str) <= max_length:
        return True
    return any(keyword in content_str for keyword in keywords)
|
||||
|
||||
|
||||
def handle_content_append(current_content, line_content, append_newline, keywords, in_special_section):
    """Append one line of body text to the content list of the current heading.

    ``current_content`` is mutated in place. When ``append_newline`` is set,
    ``should_add_newline`` is consulted once to decide whether a line break
    goes between what was collected so far and the new line; the flag is then
    cleared so the check is not repeated for following lines.

    Example: with ``current_content == ['<short title>']`` and a pending
    ``append_newline``, the list becomes
    ``['<short title>', '\\n', '<line_content>']``.

    Args:
        current_content: List of string fragments for the current heading.
        line_content: Text of the line being added.
        append_newline: Whether a newline check is still pending.
        keywords: Keywords forwarded to ``should_add_newline``.
        in_special_section: When true, every appended line is followed by
            its own line break.

    Returns:
        The updated ``append_newline`` flag (always False once the pending
        check has been consumed; unchanged otherwise).
    """
    if append_newline:
        if should_add_newline(current_content, keywords):
            current_content.append('\n')  # separate the title from the body
        append_newline = False  # the check is done only once per heading

    # The line itself is appended exactly once.
    current_content.append(line_content)

    if in_special_section:
        current_content.append('\n')  # keep each item on its own line
    return append_newline
|
||||
@ -75,7 +76,7 @@ def parse_text_by_heading(text):
|
||||
|
||||
initial_heading_pattern = None
|
||||
special_section_keywords = ['文件的组成', '文件的构成', '文件包括:', '文件包括:', '雷同认定',
|
||||
'包括以下','包括下列','情形之一','情况之一'] # 定义特殊章节关键词
|
||||
'以下','情形之一','情况之一','下列'] # 定义特殊章节关键词
|
||||
in_special_section = False # 标志是否在特殊章节中
|
||||
lines = text.split('\n')
|
||||
|
||||
@ -120,7 +121,7 @@ def parse_text_by_heading(text):
|
||||
break
|
||||
|
||||
for i, line in enumerate(lines):
|
||||
in_double_hash_mode = False
|
||||
in_double_hash_mode = False #标志是否位于页码第一行
|
||||
line_stripped = line.strip().replace('.', '.')
|
||||
if line_stripped.startswith("##"):
|
||||
in_double_hash_mode = True
|
||||
@ -182,7 +183,7 @@ def parse_text_by_heading(text):
|
||||
data[current_key] = data.get(current_key, '') + content_string
|
||||
current_key = new_key
|
||||
current_content = [line_content]
|
||||
append_newline = len(new_key.rstrip('.').split('.')) <= 2
|
||||
append_newline = len(new_key.rstrip('.').split('.')) <= 2 or any(keyword in line_content for keyword in special_section_keywords)
|
||||
last_main_number = new_key.split('.')[0]
|
||||
else:
|
||||
append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords,
|
||||
@ -193,6 +194,11 @@ def parse_text_by_heading(text):
|
||||
# 处理以点号开头并带有数字的情况
|
||||
sub_number, line_content = dot_match.groups()
|
||||
sub_number = sub_number.replace('.', '.')
|
||||
# 检查是否进入或退出特殊章节
|
||||
if any(keyword in line_content for keyword in special_section_keywords):
|
||||
in_special_section = True
|
||||
elif in_special_section:
|
||||
in_special_section = False
|
||||
if last_main_number:
|
||||
new_key = f"{last_main_number}.{sub_number}"
|
||||
if current_key is not None:
|
||||
@ -211,6 +217,11 @@ def parse_text_by_heading(text):
|
||||
if in_double_hash_mode:
|
||||
# 处理以点号开头但不带数字的情况,自动生成下一个一级序号
|
||||
temp_content = dot_text_match.group(1).strip()
|
||||
# 检查是否进入或退出特殊章节
|
||||
if any(keyword in temp_content for keyword in special_section_keywords):
|
||||
in_special_section = True
|
||||
elif in_special_section:
|
||||
in_special_section = False
|
||||
if last_main_number:
|
||||
# 生成下一个一级序号
|
||||
try:
|
||||
|
@ -269,6 +269,8 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
|
||||
# 货物标和工程标的资格审查整合
|
||||
##TODO:陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目(1)_tobidders_notice_part2.pdf 唐山市公安交通警察支队机动车查验机构视频存储回放系统竞争性谈判-招标文件正文(1)_tobidders_notice_part1.pdf 不好搞
|
||||
# 无法判断用户上传的是否为乱码文件,可以考虑并行调用大模型,如果为乱码文件直接return None
|
||||
|
||||
#截取json文件有些问题:C:\Users\Administrator\Desktop\新建文件夹 (3)\test keywords和special...
|
||||
if __name__ == "__main__":
|
||||
# 配置日志器
|
||||
unique_id = "uuidzyzy11"
|
||||
|
Loading…
x
Reference in New Issue
Block a user