1.10

2025-01-10 10:06:12 +08:00 · 2025-01-10 10:06:12 +08:00 · 66b5f78cbb
commit 66b5f78cbb
parent 72d787c899
1 changed files with 31 additions and 21 deletions
--- a/flask_app/general/投标人须知正文提取指定内容.py
+++ b/flask_app/general/投标人须知正文提取指定内容.py
@@ -5,8 +5,9 @@ from flask_app.general.json_utils import clean_json_string
 from flask_app.general.model_continue_query import process_continue_answers
 from flask_app.general.通义千问long import upload_file, qianwen_long_stream

-#提取两个大标题之间的内容
-def extract_between_sections(data, target_values):
+
+# 提取两个大标题之间的内容
+def extract_between_sections(data, target_values, flag=False):
    target_found = False
    extracted_data = {}
    current_section_title = ""
@@ -17,7 +18,7 @@ def extract_between_sections(data, target_values):
    # 遍历所有键值对
    for key, value in data.items():
        # 只匹配形如 "一": "竞争性磋商响应文件" 的章节标题
-        if section_pattern.match(key):           #匹配到大标题...
+        if section_pattern.match(key):  # 匹配到大标题...
            if target_found:
                # 如果已经找到了符合的章节，并且遇到了另一个章节
                # 保存当前块并重置
@@ -27,9 +28,10 @@ def extract_between_sections(data, target_values):
                target_found = False

            # 检查当前标题是否包含 target_values 中的任意关键词
-            if any(tv in value for tv in target_values) and not file_pattern.search(value):
-                target_found = True  # 找到了目标章节，开始捕获后续内容
-                current_section_title = value  # 保存章节标题内容
+            if any(tv in value for tv in target_values):
+                if (flag and not file_pattern.search(value)) or not flag:
+                    target_found = True  # 找到了目标章节，开始捕获后续内容
+                    current_section_title = value  # 保存章节标题内容

        elif target_found:  # 匹配到普通序号...
            current_block[key] = value
@@ -40,7 +42,8 @@ def extract_between_sections(data, target_values):

    return extracted_data

-#生成无结构的数据货物标
+
+# 生成无结构的数据货物标
 def postprocess_formatted1(section_content):
    # print(json.dumps(section_content, ensure_ascii=False, indent=4))
    """
@@ -77,7 +80,8 @@ def postprocess_formatted1(section_content):

    return concatenated

-#-----------以下为直接从序号中提取，无大标题的相关函数----------
+
+# -----------以下为直接从序号中提取，无大标题的相关函数----------

 # 对于每个target_value元素，如果有完美匹配json_data中的键，那就加入这个完美匹配的键名，否则，把全部模糊匹配到的键名都加入
 def find_keys_by_value(target_value, json_data):
@@ -92,6 +96,8 @@ def find_keys_by_value(target_value, json_data):
 def find_keys_with_prefix(key_prefix, json_data):
    subheadings = [k for k in json_data if k.startswith(key_prefix)]
    return subheadings
+
+
 def extract_json(data, target_values):
    results = {}
    for target_value in target_values:
@@ -113,7 +119,8 @@ def extract_json(data, target_values):
                results[subkey] = data[subkey]
    return results

-#生成无结构的数据工程标,对提取出的若干键值对，生成外键为target_value，值为列表的新键值对
+
+# 生成无结构的数据工程标,对提取出的若干键值对，生成外键为target_value，值为列表的新键值对
 def postprocess_formatted2(data, target_values):
    """
    从输入字典中提取目标值对应的章节，并为每个子章节添加缩进。
@@ -183,6 +190,7 @@ def postprocess_formatted2(data, target_values):

    return result

+
 def get_requirements_with_gpt(merged_baseinfo_path, selection):
    """
    根据 selection 的值选择相应的用户查询，并调用大模型获取要求。
@@ -216,7 +224,7 @@ def get_requirements_with_gpt(merged_baseinfo_path, selection):
        #          }
        #         }
        # """,
-        1:"""请根据提供的招标文件内容，提取其中有关投标文件的要求、内容，并以 JSON 格式返回结果。
+        1: """请根据提供的招标文件内容，提取其中有关投标文件的要求、内容，并以 JSON 格式返回结果。
    格式要求：
        - 外层键名应为文中提到的有关投标文件相关要求、内容的大项概览性标题。
        - 对于每个大项下的具体要求内容，有以下两种组织方式：
@@ -271,7 +279,7 @@ def get_requirements_with_gpt(merged_baseinfo_path, selection):
        #         }
        #     }
        # """,
-        2:"""该招标文件中有关开标、评标、定标、中标要求、内容(或磋商流程内容)是什么？请以 JSON 格式返回结果。
+        2: """该招标文件中有关开标、评标、定标、中标要求、内容(或磋商流程内容)是什么？请以 JSON 格式返回结果。
    格式要求：
        - 外层键名应为文中提到的有关开标、评标、定标、中标要求或磋商流程内容的大项概览性标题。
        - 对于每个大项下的子要求，使用嵌套的键值对进行组织。
@@ -324,11 +332,11 @@ def get_requirements_with_gpt(merged_baseinfo_path, selection):
    # 调用大模型并处理响应
    try:
        questions_to_continue = []
-        qianwen_res = qianwen_long_stream(file_id, user_query,2,1,True)
+        qianwen_res = qianwen_long_stream(file_id, user_query, 2, 1, True)
        message = qianwen_res[0]
        parsed = clean_json_string(message)
        total_tokens = qianwen_res[1]
-        if not parsed and total_tokens >5900:
+        if not parsed and total_tokens > 5900:
            questions_to_continue.append((user_query, message))
            if questions_to_continue:
                continued_results = process_continue_answers(questions_to_continue, 3, file_id)
@@ -338,6 +346,7 @@ def get_requirements_with_gpt(merged_baseinfo_path, selection):
    except Exception as e:
        return {"error": "调用大模型失败"}

+
 # 读取JSON数据，提取内容，转换结构，并打印结果
 def extract_from_notice(merged_baseinfo_path, clause_path, type):
    """
@@ -361,12 +370,12 @@ def extract_from_notice(merged_baseinfo_path, clause_path, type):
    # 映射 type 到 target_values
    type_target_map = {
        1: ["投标", "投标文件", "响应文件"],
-        2: ["开标", "评标", "定标", "评审", "成交", "合同", "磋商","谈判","中标", "程序", "步骤"],
+        2: ["开标", "评标", "定标", "评审", "成交", "合同", "磋商", "谈判", "中标", "程序", "步骤"],
        3: ["重新招标、不再招标和终止招标", "重新招标", "重新采购", "不再招标", "不再采购", "终止招标", "终止采购"],
        4: ["评标"]  # 测试
    }
-
    # 获取对应 type 的 target_values
+    flag = (type == 2)
    target_values = type_target_map.get(type)
    if not target_values:
        print(f"Error: Invalid type specified: {type}. Use 1, 2, 3, or 4.")
@@ -378,7 +387,7 @@ def extract_from_notice(merged_baseinfo_path, clause_path, type):
                data = json.load(file)
            if len(data) >= 60:
                # 尝试使用大章节筛选
-                extracted_data = extract_between_sections(data, target_values)
+                extracted_data = extract_between_sections(data, target_values, flag)
                if extracted_data:
                    # 后处理并返回结果
                    extracted_data_concatenated = {
@@ -402,12 +411,13 @@ def extract_from_notice(merged_baseinfo_path, clause_path, type):
        print(f"Error occurred while processing clause_path '{clause_path}': {e}")
        return DEFAULT_RESULT

+
 if __name__ == "__main__":
    # file_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\3bffaa84-2434-4bd0-a8ee-5c234ccd7fa0\\clause1.json'
-    merged_baseinfo_path=r"C:\Users\Administrator\Desktop\fsdownload\ec7d5328-9c57-450f-baf4-2e5a6f90ed1d\tmp\merged_baseinfo_path_more.pdf"
-    clause_path=r"C:\Users\Administrator\Desktop\fsdownload\ec7d5328-9c57-450f-baf4-2e5a6f90ed1d\tmp\clause1.json"
+    merged_baseinfo_path = r"C:\Users\Administrator\Desktop\fsdownload\b29de31a-297e-42cf-b9ba-6859e530a472\ztbfile_merged_baseinfo.pdf"
+    clause_path = r"C:\Users\Administrator\Desktop\fsdownload\b29de31a-297e-42cf-b9ba-6859e530a472\clause1.json"
    try:
-        res = extract_from_notice(merged_baseinfo_path,clause_path, 2)  # 可以改变此处的 type 参数测试不同的场景
+        res = extract_from_notice(merged_baseinfo_path, clause_path, 1)  # 可以改变此处的 type 参数测试不同的场景
        res2 = json.dumps(res, ensure_ascii=False, indent=4)
        print(res2)
    except ValueError as e: