diff --git a/flask_app/main/投标人须知正文条款提取成json文件.py b/flask_app/main/投标人须知正文条款提取成json文件.py index 487715c..7d86235 100644 --- a/flask_app/main/投标人须知正文条款提取成json文件.py +++ b/flask_app/main/投标人须知正文条款提取成json文件.py @@ -80,6 +80,8 @@ def parse_text_by_heading(text): current_key = None current_content = [] append_newline = False + sub_key_count = 0 # 用于生成二级标题的计数器 + third_key_count = 0 # 用于生成三级标题的计数器 lines = text.split('\n') for i, line in enumerate(lines): @@ -96,26 +98,48 @@ def parse_text_by_heading(text): if current_key is None or (compare_headings(current_key, new_key) and ( len(current_content) == 0 or current_content[-1][-1] != '第')): if current_key is not None: - # 将之前的内容保存到data中,保留第一个换行符,后续的换行符转为空字符 + # 将之前的内容保存到 data 中 content_string = ''.join(current_content).strip() data[current_key] = content_string.replace(' ', '') current_key = new_key current_content = [line_content] - # 只有当标题级别为两级(如 1.1)时,才设置 append_newline 为 True + # 初始化二级和三级标题的计数器 + sub_key_count = 0 + third_key_count = 0 append_newline = len(new_key.split('.')) == 2 else: append_newline = handle_content_append(current_content, line_content, append_newline, keywords) else: - if line_stripped: - append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords) + # 匹配子序号 '1. 2.' 生成三级标题 + sub_match = re.match(r'^(\d+)[\.\、]\s*(.*)$', line_stripped) + if sub_match: + sub_number, sub_content = sub_match.groups() + # 生成三级标题 key,如 5.0.1, 5.0.2 等 + if current_key: + if sub_key_count == 0: + # 生成二级标题,如 5.0 + sub_key_count += 1 + sub_key = f"{current_key}.0" + data[sub_key] = ''.join(current_content).strip() + current_content = [] # 重置内容以生成新的内容 + third_key_count += 1 + sub_key_with_third = f"{current_key}.0.{third_key_count}" + data[sub_key_with_third] = sub_content.strip() + continue + else: + # 处理一般行内容 + if line_stripped: + append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords) if current_key is not None: # 保存最后一部分内容 content_string = ''.join(current_content).strip() - data[current_key] = content_string.replace(' ', '') + if content_string: + data[current_key] = content_string.replace(' ', '') return data + def convert_to_json(file_path, start_word, end_phrases): if file_path.endswith('.docx'): text = extract_text_from_docx(file_path) diff --git a/flask_app/main/招标文件解析.py b/flask_app/main/招标文件解析.py index b496eaf..58c1bc5 100644 --- a/flask_app/main/招标文件解析.py +++ b/flask_app/main/招标文件解析.py @@ -240,8 +240,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id): # 每提交一个任务暂停1秒,确保任务逐个提交 for task_name, future in futures.items(): - futures[task_name] = executor.submit(future) - time.sleep(1) # 任务提交后暂停1秒 + time.sleep(1) # 可以保留这个暂停 # 按照任务完成的顺序返回结果 for future in concurrent.futures.as_completed(futures.values()):