From e4279e9f2d8d22672720ede964e68d09beae3ae0 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Thu, 23 Jan 2025 10:16:56 +0800 Subject: [PATCH] =?UTF-8?q?1.22=20=E9=99=90=E5=88=B6=E4=BA=86=E6=8B=9B?= =?UTF-8?q?=E6=A0=87=E5=85=AC=E7=A4=BA=E7=BD=91=E5=9D=80=E7=9A=84=E9=95=BF?= =?UTF-8?q?=E5=BA=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../投标人须知正文条款提取成json文件.py | 23 ++++++++++++++----- flask_app/routes/货物标解析main.py | 2 ++ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/flask_app/general/投标人须知正文条款提取成json文件.py b/flask_app/general/投标人须知正文条款提取成json文件.py index 918f6bd..64101ac 100644 --- a/flask_app/general/投标人须知正文条款提取成json文件.py +++ b/flask_app/general/投标人须知正文条款提取成json文件.py @@ -40,16 +40,17 @@ def compare_headings(current, new): def should_add_newline(content, keywords, max_length=20): + #如果拼接后的字符串 content_str 中包含任何一个在 keywords 中的关键词,或者 content_str 的长度小于或等于 max_length,则返回 True,表示应该添加换行符。 content_str = ''.join(content).strip() return any(keyword in content_str for keyword in keywords) or len(content_str) <= max_length def handle_content_append(current_content, line_content, append_newline, keywords, in_special_section): if append_newline: - if should_add_newline(current_content, keywords): - current_content.append('\n') # 添加换行符 + if should_add_newline(current_content, keywords): #current_content:['费用承担'] line_content:'投标人准备和参加投标活动发生的费用自理。' + current_content.append('\n') # 添加换行符 #current_content:['费用承担', '\n'] append_newline = False - current_content.append(line_content) + current_content.append(line_content) #current_content:['费用承担', '\n', '投标人准备和参加投标活动发生的费用自理。'] if in_special_section: current_content.append('\n') return append_newline @@ -75,7 +76,7 @@ def parse_text_by_heading(text): initial_heading_pattern = None special_section_keywords = ['文件的组成', '文件的构成', '文件包括:', '文件包括:', '雷同认定', - '包括以下','包括下列','情形之一','情况之一'] # 定义特殊章节关键词 + '以下','情形之一','情况之一','下列'] # 定义特殊章节关键词 in_special_section = False # 标志是否在特殊章节中 lines = text.split('\n') @@ -120,7 +121,7 @@ def parse_text_by_heading(text): break for i, line in enumerate(lines): - in_double_hash_mode = False + in_double_hash_mode = False #标志是否位于页码第一行 line_stripped = line.strip().replace('.', '.') if line_stripped.startswith("##"): in_double_hash_mode = True @@ -182,7 +183,7 @@ def parse_text_by_heading(text): data[current_key] = data.get(current_key, '') + content_string current_key = new_key current_content = [line_content] - append_newline = len(new_key.rstrip('.').split('.')) <= 2 + append_newline = len(new_key.rstrip('.').split('.')) <= 2 or any(keyword in line_content for keyword in special_section_keywords) last_main_number = new_key.split('.')[0] else: append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords, @@ -193,6 +194,11 @@ def parse_text_by_heading(text): # 处理以点号开头并带有数字的情况 sub_number, line_content = dot_match.groups() sub_number = sub_number.replace('.', '.') + # 检查是否进入或退出特殊章节 + if any(keyword in line_content for keyword in special_section_keywords): + in_special_section = True + elif in_special_section: + in_special_section = False if last_main_number: new_key = f"{last_main_number}.{sub_number}" if current_key is not None: @@ -211,6 +217,11 @@ def parse_text_by_heading(text): if in_double_hash_mode: # 处理以点号开头但不带数字的情况,自动生成下一个一级序号 temp_content = dot_text_match.group(1).strip() + # 检查是否进入或退出特殊章节 + if any(keyword in temp_content for keyword in special_section_keywords): + in_special_section = True + elif in_special_section: + in_special_section = False if last_main_number: # 生成下一个一级序号 try: diff --git a/flask_app/routes/货物标解析main.py b/flask_app/routes/货物标解析main.py index 1194aad..da2e454 100644 --- a/flask_app/routes/货物标解析main.py +++ b/flask_app/routes/货物标解析main.py @@ -269,6 +269,8 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id): # 货物标和工程标的资格审查整合 ##TODO:陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目(1)_tobidders_notice_part2.pdf 唐山市公安交通警察支队机动车查验机构视频存储回放系统竞争性谈判-招标文件正文(1)_tobidders_notice_part1.pdf 不好搞 # 无法判断用户上传的是否为乱码文件,可以考虑并行调用大模型,如果为乱码文件直接return None + +#截取json文件有些问题:C:\Users\Administrator\Desktop\新建文件夹 (3)\test keywords和special... if __name__ == "__main__": # 配置日志器 unique_id = "uuidzyzy11"