9.25

2024-09-25 10:45:32 +08:00 · 2024-09-25 10:45:32 +08:00 · 06a5cd03e1
commit 06a5cd03e1
parent bdfc17bb13
2 changed files with 30 additions and 7 deletions
--- a/flask_app/main/投标人须知正文条款提取成json文件.py
+++ b/flask_app/main/投标人须知正文条款提取成json文件.py
@@ -80,6 +80,8 @@ def parse_text_by_heading(text):
    current_key = None
    current_content = []
    append_newline = False
+    sub_key_count = 0  # 用于生成二级标题的计数器
+    third_key_count = 0  # 用于生成三级标题的计数器

    lines = text.split('\n')
    for i, line in enumerate(lines):
@@ -96,26 +98,48 @@ def parse_text_by_heading(text):
            if current_key is None or (compare_headings(current_key, new_key) and (
                    len(current_content) == 0 or current_content[-1][-1] != '第')):
                if current_key is not None:
-                    # 将之前的内容保存到data中，保留第一个换行符，后续的换行符转为空字符
+                    # 将之前的内容保存到 data 中
                    content_string = ''.join(current_content).strip()
                    data[current_key] = content_string.replace(' ', '')
                current_key = new_key
                current_content = [line_content]
-                # 只有当标题级别为两级（如 1.1）时，才设置 append_newline 为 True
+                # 初始化二级和三级标题的计数器
+                sub_key_count = 0
+                third_key_count = 0
                append_newline = len(new_key.split('.')) == 2
            else:
                append_newline = handle_content_append(current_content, line_content, append_newline, keywords)
        else:
+            # 匹配子序号 '1. 2.' 生成三级标题
+            sub_match = re.match(r'^(\d+)[\.\、]\s*(.*)$', line_stripped)
+            if sub_match:
+                sub_number, sub_content = sub_match.groups()
+                # 生成三级标题 key，如 5.0.1, 5.0.2 等
+                if current_key:
+                    if sub_key_count == 0:
+                        # 生成二级标题，如 5.0
+                        sub_key_count += 1
+                        sub_key = f"{current_key}.0"
+                        data[sub_key] = ''.join(current_content).strip()
+                        current_content = []  # 重置内容以生成新的内容
+                    third_key_count += 1
+                    sub_key_with_third = f"{current_key}.0.{third_key_count}"
+                    data[sub_key_with_third] = sub_content.strip()
+                continue
+            else:
+                # 处理一般行内容
                if line_stripped:
                    append_newline = handle_content_append(current_content, line_stripped, append_newline, keywords)

    if current_key is not None:
        # 保存最后一部分内容
        content_string = ''.join(current_content).strip()
+        if content_string:
            data[current_key] = content_string.replace(' ', '')

    return data

+
 def convert_to_json(file_path, start_word, end_phrases):
    if file_path.endswith('.docx'):
        text = extract_text_from_docx(file_path)
--- a/flask_app/main/招标文件解析.py
+++ b/flask_app/main/招标文件解析.py
@@ -240,8 +240,7 @@ def main_processing(output_folder, downloaded_file_path, file_type, unique_id):

        # 每提交一个任务暂停1秒，确保任务逐个提交
        for task_name, future in futures.items():
-            futures[task_name] = executor.submit(future)
-            time.sleep(1)  # 任务提交后暂停1秒
+            time.sleep(1)  # 可以保留这个暂停

        # 按照任务完成的顺序返回结果
        for future in concurrent.futures.as_completed(futures.values()):