9.29投标人须知提取指定内容，结构化处理

2024-09-29 16:55:50 +08:00 · 2024-09-29 16:55:50 +08:00 · d0049861fe
commit d0049861fe
parent 6637086547
9 changed files with 354 additions and 100 deletions
--- a/flask_app/main/test.py
+++ b/flask_app/main/test.py
@ -1,19 +1,98 @@
 # -*- encoding:utf-8 -*-
 import json
 import re
 def post_process(value):
    """Split a numbered clause string into its individual numbered items.

    The numbering style that occurs first in ``value`` ('1、', '(1)' /
    '（1）', '1.', '一、' or '一.') is used as the delimiter. The split is
    accepted only if every resulting piece contains at least 50 word
    characters (CJK characters, letters, digits, underscore).

    Args:
        value: Any object; only ``str`` values are processed.

    Returns:
        A list of stripped pieces when the text splits into two or more
        sufficiently long pieces; otherwise ``value`` itself, unchanged.
    """
    if not isinstance(value, str):
        return value
    # (search pattern, zero-width split pattern) for each numbering style.
    # The negative look-behind stops a split from landing inside a
    # multi-character number: without it '10、' is cut into '1' + '0、...'
    # and '十一、' into '十' + '一、...'.
    patterns = [
        (r'\d+、', r'(?<!\d)(?=\d+、)'),  # '1、'
        (r'[（(]\d+[）)]', r'(?=[（(]\d+[）)])'),  # '(1)' or '（1）'
        (r'\d+\.', r'(?<!\d)(?=\d+\.)'),  # '1.'
        (r'[一二三四五六七八九十]+、',
         r'(?<![一二三四五六七八九十])(?=[一二三四五六七八九十]+、)'),  # '一、'
        (r'[一二三四五六七八九十]+\.',
         r'(?<![一二三四五六七八九十])(?=[一二三四五六七八九十]+\.)'),  # '一.'
    ]
    # Pick the style whose first occurrence is earliest in the text.
    first_match = None
    first_match_position = len(value)
    for search_pattern, split_pattern_candidate in patterns:
        match = re.search(search_pattern, value)
        if match and match.start() < first_match_position:
            first_match = split_pattern_candidate
            first_match_position = match.start()
    if first_match is None:
        # No numbering at all: keep the original text as-is.
        return value
    processed_blocks = []
    for block in re.split(first_match, value):
        if not block:
            # A split at position 0 yields an empty leading piece.
            continue
        if len(re.findall(r'[\u4e00-\u9fff\w]', block)) < 50:
            # A short piece means the "numbering" was probably not a real
            # item list, so fall back to the unsplit text.
            return value
        processed_blocks.append(block.strip())
    if len(processed_blocks) < 2:
        # Nothing was actually separated; do not wrap the text in a list.
        return value
    return processed_blocks
 # 递归地处理嵌套结构
 def process_nested_data(data):
    """Apply ``post_process`` to every leaf value reachable through dicts.

    Dicts are rebuilt key by key with each value processed recursively.
    Lists are returned as the same object, untouched. Any other value is
    handed to ``post_process``.
    """
    if isinstance(data, dict):
        return {name: process_nested_data(child) for name, child in data.items()}
    if isinstance(data, list):
        # Lists are deliberately left exactly as they are.
        return data
    # Leaf value (normally a string).
    return post_process(data)
 # 测试数据
 # data = {
 #     '开标': {
 #         '开标': '1、黄石市公共资源电子交易平台远程不见面开标系统（以下简称：不见面开标系统）可以通过黄石市公共资源交易中心网站，黄石市工程建设电子交易系统（网址（https://gcjs.hsztbzx.com/login）进入不见面开投标人测试见面开标系统对投标人投标人测试见面开标系统对投标人标大厅，2、不见面开标系统对投标人测试测试见面开标系统对投标人测试见面开标系统对投标人测试见面开标系统对投标人测试见面开标系统对投标人测试见面投标人测试见面开标系统对投标人开标系统对投标人测试',
 #         '开标程序': [
 #             '主持人按下列程序在“电子交易平台”的“开标大厅”进行在线开标：（1）宣布开标纪律；（2）公布主持人、招标人代表、监标人等有关人员姓名；（3）公布在投标截止时间前投标文件的递交情况；（4）公布投标保证金递交情况；（5）投标人根据提示在投标人须知前附表规定的时间内解密投标文件；（6）读取已解密的投标文件的内容；（7）公布投标人名称、标段名称、投标保证金的递交情况、投标报价、质量目标、工期及其他内容，并生成开标记录；（8）开标结束。',
 #             '在本章第5.2.1（5）目规定的时间内，非因“电子交易平台”原因造成投标文件未解密的，视为投标人撤回投标文件。已解密的投标文件少于三个的，招标失败；已解密的投标文件不少于三个，开标继续进行。'
 #         ],
 #         '开标异议': [
 #             '投标人对开标有异议的，应当在开标过程中提出；招标人当场对异议作出答复，并记入开标记录。异议与答复应通过“开标大厅”在“异议与答复”菜单以书面形式进行。',
 #             '投标人异议成立的，招标人将及时采取纠正措施，或者提交评标委员会评审确认；投标人异议不成立的，招标人将当场给予解释说明。'
 #         ],
 #         '特殊情况的处置': [
 #             '1、黄石市公共资源电子交易平台远程不见面开标系统（以下简称：不见面开标系统）可以通过黄石市公共资源交易中心网站，黄石市工程建设电子交易系统（网址（https://gcjs.hsztbzx.com/login）进入不见面开标大厅，2、不见面开标系统对投标人测试测试见面开标系统对投标人测试见面开标系统对投标人测试见见面开标系统对投标人测试见面开标系面开标系统对投',
 #             '因“电子交易平台”系统故障导致无法正常开标的，招标人将暂停开标，待系统恢复正常后继续开标。',
 #             '“电子交易平台”系统故障是指下列情形：（1）系统服务器发生故障，无法访问或无法使用系统；（2）系统的软件或数据库出现错误，不能进行正常操作；（3）系统发现有安全漏洞，有潜在的泄密危险；（4）出现断电、断网事故；（5）其他无法保证招投标过程正常进行的情形。'
 #         ],
 #         '投标报价不参加评标基准价计算的情况': '若招标人发现投标文件出现投标人须知前附表中规定的投标报价不参加评标基准价计算的情况，经监标人确认后招标人将如实记录，其投标报价不参与评标基准价的计算，并提交评标委员会评审。'
 #     }
 # }
 data={
 '开标': '1、黄石市公共资源电子交易平台远程不见面开标系统（以下简称：不见面开标系统）可以通过黄石市公共资源交易中心网站，黄石市工程建设电子交易系统（网址（https://gcjs.hsztbzx.com/login）进入不见面开投标人测试见面开标系统对投标人投标人测试见面开标系统对投标人标大厅，2、不见面开标系统对投标人测试测试见面开标系统对投标人测试见面开标系统对投标人测试见面开标系统对投标人测试见面开标系统对投标人测试见面投标人测试见面开标系统对投标人开标系统对投标人测试'
 }
 # Run the nested-data post-processing on the sample data and print the result as JSON.
 result = process_nested_data(data)
 print(json.dumps(result,ensure_ascii=False,indent=4))
 from flask_app.main.json_utils import extract_content_from_json
 from flask_app.main.多线程提问 import multi_threading
 from flask_app.货物标.资格审查main import update_json_data
 # Scratch check: send one question to the knowledge base and collect the parsed answers.
 ques=["关于'资格要求',本采购文件第一章第二款要求的内容是怎样的？请按json格式给我提供信息，键名为'资格性审查.资格要求'，你不能回答而键值需要完全与原文保持一致，不要擅自总结、删减，如果存在未知信息，请在对应键值处填'未知'。"]
 knowledge_name="6.2视频会议docx"
 results = multi_threading(ques, knowledge_name)  # items without clause numbers are asked to the LLM directly
 first_response_list = []
 # NOTE(review): each item of `results` is unpacked as a pair whose first element is
 # used as the query index in the messages below — confirm against multi_threading.
 for _, response in results:
        try:
            if response and len(response) > 1:  # response must exist and hold at least two elements
                temp = extract_content_from_json(response[1])
                first_response_list.append(temp)
            else:
                print(f"形式响应评审：Warning: Missing or incomplete response data for query index {_}.")
        except Exception as e:
            print(f"形式响应评审：Error processing response for query index {_}: {e}")
 print(first_response_list)
--- a/flask_app/main/商务标技术标整合.py
+++ b/flask_app/main/商务标技术标整合.py
@ -1,3 +1,4 @@
 # -*- encoding:utf-8 -*-
 import json
 from flask_app.main.json_utils import clean_json_string
@ -100,6 +101,8 @@ def combine_evaluation_standards(truncate2):
    file_id = upload_file(truncate2)
    user_query_2 = (
        "根据该文档中的评标办法前附表，请你列出该文件的技术评分，商务评分，投标报价评审标准以及它们对应的具体评分要求，若对应内容中存在其他信息，在键名如'技术评分'中新增子键名'备注'存放该信息。如果评分内容（因素）不是这3个，则返回文档中给定的评分内容（因素）以及它的评分要求。请以json格式返回结果，不要回答有关形式、资格、响应性评审标准的内容")
    # user_query_2 = (
    #         "根据该文档中的评标办法前附表，请你列出该文件的技术评分，商务评分，投标报价评审标准以及它们对应的具体评分要求，请以json格式返回结果，请在这三大块评分中分别用若干键值对表示评分要求，最内层的键名为'分数'及'要求'，若这三大块评分中存在其他信息，则在相应评分大块中新增嵌套键名'备注'存放该信息，键值为具体的要求，否则不需要。如果评分内容（因素）不是这3个，则返回文档中给定的评分内容（因素）以及它的评分要求。不要回答有关形式、资格、响应性评审标准的内容")
    evaluation_res = qianwen_long(file_id, user_query_2)
    target_values1 = ['技术标','技术部分','设计', '实施',"技术评分"]
    # target_values2=['投标报价','商务标','商务部分','报价部分','业绩','信誉','分值','计算公式','信用','人员','资格','奖项','认证','荣誉']
--- a/flask_app/main/投标人须知正文提取指定内容.py
+++ b/flask_app/main/投标人须知正文提取指定内容.py
@ -16,7 +16,7 @@ def find_keys_with_prefix(key_prefix, json_data):
    return subheadings
-# 从文件中读取JSON数据，并提取特定内容
+# 从完整的json文件中读取所需数据，eg:投标、评标
 def extract_json(data, target_values):
    results = {}
    for target_value in target_values:
@ -63,7 +63,7 @@ def transform_json(data):
    result = {}
    temp = {0: result}  # 初始化根字典
-    # 首先，创建一个临时字典用于检查是否存在三级标题
+    # 首先,创建一个临时字典用于检查是否存在三级标题
    has_subkey = {}
    for key in data.keys():
        parts = key.split('.')
@ -81,49 +81,112 @@ def transform_json(data):
                print(f"No parent found at level {len(levels) - 1} for key '{key}'. Check the data structure.")
                continue
-            if len(levels) == len(match.groups()):
+            if len(levels) == 1:  # 一级标题
-                if isinstance(parent, list):
+                new_key, *new_value = value.split('\n', 1)
                new_key = new_key.strip()
                new_value = new_value[0].strip() if new_value else ""
                parent[new_key] = {}
                if new_value:
                    parent[new_key][new_key] = new_value  # 使用 new_key 作为键名，而不是固定的 "content"
                temp[len(levels)] = parent[new_key]
            elif len(levels) == 2:  # 二级标题
                new_key, *new_value = value.split('\n', 1)
                new_key = new_key.strip()
                new_value = new_value[0].strip() if new_value else ""
                if f"{levels[0]}.{levels[1]}" in has_subkey:
                    parent[new_key] = [new_value] if new_value else []
                else:
                    parent[new_key] = new_value
                temp[len(levels)] = parent[new_key]
            else:  # 三级标题
                if isinstance(parent, dict):
                    parent_key = list(parent.keys())[-1]
                    if isinstance(parent[parent_key], list):
                        parent[parent_key].append(value)
                    elif parent[parent_key]:
                        parent[parent_key] = [parent[parent_key], value]
                    else:
                        parent[parent_key] = [value]
                elif isinstance(parent, list):
                    parent.append(value)
                else:
                    # 对于根级别，使用完整的值作为键
                    if len(levels) == 1:
                        parent[value] = {}
                        temp[len(levels)] = parent[value]
                    else:
                        parent[value.split()[0]] = value
            else:
                new_key = value
                if '\n' in value and len(levels) == 2 and f"{levels[0]}.{levels[1]}" not in has_subkey:
                    new_key, new_value = value.split('\n', 1)
                    new_key = new_key.strip()
                    new_value = new_value.strip()
                    if isinstance(parent, list):
                        if len(parent) == 0 or not isinstance(parent[-1], dict):
                            parent.append({})
                        parent[-1][new_key] = new_value
                    else:
                        parent[new_key] = new_value
                else:
                    if isinstance(parent, list):
                        if len(parent) == 0 or not isinstance(parent[-1], dict):
                            parent.append({})
                        parent = parent[-1]
                    if new_key not in parent:
                        parent[new_key] = []
                    temp[len(levels)] = parent[new_key]
    def remove_single_item_lists(node):
        if isinstance(node, dict):
            for key in list(node.keys()):
                node[key] = remove_single_item_lists(node[key])
-                if isinstance(node[key], list) and not node[key]:
+                if isinstance(node[key], list) and len(node[key]) == 1:
-                    node[key] = ""
+                    node[key] = node[key][0]
        elif isinstance(node, list) and len(node) == 1:
            return remove_single_item_lists(node[0])
        return node
    return remove_single_item_lists(result)
 def post_process(value):
    """Split a numbered clause string into its individual numbered items.

    The numbering style that occurs first in ``value`` ('1、', '(1)' /
    '（1）', '1.', '一、' or '一.') is used as the delimiter. The split is
    accepted only if every resulting piece contains at least 50 word
    characters (CJK characters, letters, digits, underscore).

    Args:
        value: Any object; only ``str`` values are processed.

    Returns:
        A list of stripped pieces when the text splits into two or more
        sufficiently long pieces; otherwise ``value`` itself, unchanged.
    """
    if not isinstance(value, str):
        return value
    # (search pattern, zero-width split pattern) for each numbering style.
    # The negative look-behind stops a split from landing inside a
    # multi-character number: without it '10、' is cut into '1' + '0、...'
    # and '十一、' into '十' + '一、...'.
    patterns = [
        (r'\d+、', r'(?<!\d)(?=\d+、)'),  # '1、'
        (r'[（(]\d+[）)]', r'(?=[（(]\d+[）)])'),  # '(1)' or '（1）'
        (r'\d+\.', r'(?<!\d)(?=\d+\.)'),  # '1.'
        (r'[一二三四五六七八九十]+、',
         r'(?<![一二三四五六七八九十])(?=[一二三四五六七八九十]+、)'),  # '一、'
        (r'[一二三四五六七八九十]+\.',
         r'(?<![一二三四五六七八九十])(?=[一二三四五六七八九十]+\.)'),  # '一.'
    ]
    # Pick the style whose first occurrence is earliest in the text.
    first_match = None
    first_match_position = len(value)
    for search_pattern, split_pattern_candidate in patterns:
        match = re.search(search_pattern, value)
        if match and match.start() < first_match_position:
            first_match = split_pattern_candidate
            first_match_position = match.start()
    if first_match is None:
        # No numbering at all: keep the original text as-is.
        return value
    processed_blocks = []
    for block in re.split(first_match, value):
        if not block:
            # A split at position 0 yields an empty leading piece.
            continue
        if len(re.findall(r'[\u4e00-\u9fff\w]', block)) < 50:
            # A short piece means the "numbering" was probably not a real
            # item list, so fall back to the unsplit text.
            return value
        processed_blocks.append(block.strip())
    if len(processed_blocks) < 2:
        # Nothing was actually separated; do not wrap the text in a list.
        return value
    return processed_blocks
 # 递归地处理嵌套结构
 def process_nested_data(data):
    """Walk nested dicts and run ``post_process`` on each non-container leaf.

    A dict produces a new dict with the same keys and recursively
    processed values. A list is returned unchanged (same object). Every
    other value is passed to ``post_process``.
    """
    if isinstance(data, dict):
        return {name: process_nested_data(child) for name, child in data.items()}
    if isinstance(data, list):
        # Lists are intentionally kept exactly as received.
        return data
    # Innermost element (normally a string).
    return post_process(data)
 # 读取JSON数据，提取内容，转换结构，并打印结果
 def extract_from_notice(clause_path, type):
@ -133,6 +196,8 @@ def extract_from_notice(clause_path, type):
        target_values = ["开标", "评标", "定标"]
    elif type == 3:
        target_values = ["重新招标、不再招标和终止招标", "重新招标", "不再招标", "终止招标"]
    elif type == 4:
        target_values = ["开标"]   #测试
    else:
        raise ValueError("Invalid type specified. Use 1 for '投标文件, 投标' or 2 for '开标, 评标, 定标'or 3 for '重新招标'")
    with open(clause_path, 'r', encoding='utf-8') as file:
@ -140,15 +205,15 @@ def extract_from_notice(clause_path, type):
        extracted_data = extract_json(data, target_values)  # 读取json
        sorted_data=sort_clean_data_keys(extracted_data)    #对键进行排序
        transformed_data = transform_json(sorted_data)
-        return transformed_data
+        final_result=process_nested_data(transformed_data)
        return final_result
-
+#TODO: 再审视一下zbtest20的处理是否合理
 #TODO:考虑5.1这种二级标题后面换行符，但是仍然存在5.1.1情况时该怎么办  再审视一下zbtest20的处理是否合理
 if __name__ == "__main__":
    file_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\tmp\\clause1.json'
    # file_path='C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\clause1.json'
    try:
-        res = extract_from_notice(file_path, 2)  # 可以改变此处的 type 参数测试不同的场景
+        res = extract_from_notice(file_path, 4)  # 可以改变此处的 type 参数测试不同的场景
        res2=json.dumps(res,ensure_ascii=False,indent=4)
        print(res2)
    except ValueError as e:
--- a/flask_app/main/投标人须知正文条款提取成json文件.py
+++ b/flask_app/main/投标人须知正文条款提取成json文件.py
@ -130,11 +130,11 @@ def parse_text_by_heading(text):
                    # 将之前的内容保存到data中，保留第一个换行符，后续的换行符转为空字符
                    content_string = ''.join(current_content).strip()
                    data[current_key] = content_string.replace(' ', '')
                current_key = new_key
                current_content = [line_content]
                # 只有当标题级别为两级（如 1.1）时，才设置 append_newline 为 True
-                append_newline = len(new_key.split('.')) == 2
+                # append_newline = len(new_key.split('.')) == 2
                append_newline = len(new_key.rstrip('.').split('.')) <= 2
            else:
                append_newline = handle_content_append(current_content, line_content, append_newline, keywords)
        else:
@ -184,46 +184,46 @@ def convert_clause_to_json(input_path,output_folder,type=1):
    output_path = os.path.join(output_folder, file_name)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=4, ensure_ascii=False)
-    post_process_json(output_path)
+    # post_process_json(output_path)
    print(f"投标人须知正文条款提取成json文件: The data has been processed and saved to '{output_path}'.")
    return output_path
-def post_process_json(json_file_path):   #处理一级标题如'5.1'过长的内容 zbtest20
+# def post_process_json(json_file_path):   #处理一级标题如'5.1'过长的内容 zbtest20
-    # 读取 JSON 文件
+#     # 读取 JSON 文件
-    with open(json_file_path, 'r', encoding='utf-8') as file:
+#     with open(json_file_path, 'r', encoding='utf-8') as file:
-        data = json.load(file)
+#         data = json.load(file)
-    processed_data = {}
+#     processed_data = {}
-    for key, value in data.items():
+#     for key, value in data.items():
-        # 检查是否是一级标题（如 '5.'），并且其值包含 '\n'
+#         # 检查是否是一级标题（如 '5.'），并且其值包含 '\n'
-        if re.match(r'^\d+\.\s*$', key) and '\n' in value:
+#         if re.match(r'^\d+\.\s*$', key) and '\n' in value:
-            # 分割标题和正文
+#             # 分割标题和正文
-            title, content = value.split('\n', 1)
+#             title, content = value.split('\n', 1)
-
+#
-            # 添加原来的标题作为 '5.0'，其值为原来标题的内容（即 title）
+#             # 添加原来的标题作为 '5.0'，其值为原来标题的内容（即 title）
-            processed_data[key] = title.strip()
+#             processed_data[key] = title.strip()
-            sub_key = f"{key.rstrip('.')}." + "0"  # 自动生成 '5.0'，与 '5.' 一致，保证点号的存在
+#             sub_key = f"{key.rstrip('.')}." + "0"  # 自动生成 '5.0'，与 '5.' 一致，保证点号的存在
-
+#
-            processed_data[sub_key] = title.strip()
+#             processed_data[sub_key] = title.strip()
-
+#
-            # 初始化计数器
+#             # 初始化计数器
-            sub_count = 1
+#             sub_count = 1
-
+#
-            # 根据子序号 '1.' 或 '1、' 进行分割
+#             # 根据子序号 '1.' 或 '1、' 进行分割
-            sub_sections = re.split(r'(\d+[\.\、])\s*', content)
+#             sub_sections = re.split(r'(\d+[\.\、])\s*', content)
-
+#
-            current_sub_content = ""
+#             current_sub_content = ""
-            for i in range(1, len(sub_sections), 2):
+#             for i in range(1, len(sub_sections), 2):
-                sub_number = sub_sections[i].strip()  # 获取子序号
+#                 sub_number = sub_sections[i].strip()  # 获取子序号
-                sub_content = sub_sections[i + 1].strip()  # 获取内容
+#                 sub_content = sub_sections[i + 1].strip()  # 获取内容
-
+#
-                # 生成三级标题，如 '5.0.1', '5.0.2'
+#                 # 生成三级标题，如 '5.0.1', '5.0.2'
-                sub_key_with_number = f"{sub_key}.{sub_count}"
+#                 sub_key_with_number = f"{sub_key}.{sub_count}"
-                processed_data[sub_key_with_number] = sub_content
+#                 processed_data[sub_key_with_number] = sub_content
-                sub_count += 1
+#                 sub_count += 1
-
+#
-        else:
+#         else:
-            # 如果没有分割需求，保留原数据
+#             # 如果没有分割需求，保留原数据
-            processed_data[key] = value
+#             processed_data[key] = value
    # 将修改后的数据重新写入到原来的 JSON 文件中
    with open(json_file_path, 'w', encoding='utf-8') as file:
--- a/flask_app/main/文档理解大模型版/调用文件解析状态查询.py
+++ b/flask_app/main/文档理解大模型版/调用文件解析状态查询.py
@ -0,0 +1,33 @@
 from typing import List
 from alibabacloud_docmind_api20220711.client import Client as docmind_api20220711Client
 from alibabacloud_tea_openapi import models as open_api_models
 from alibabacloud_docmind_api20220711 import models as docmind_api20220711_models
 from alibabacloud_tea_util.client import Client as UtilClient
 from alibabacloud_credentials.client import Client as CredClient
 def query(job_id='docmind-20240929-befe0a1a529b4640b5fbf51ff359a272'):
    """Query and print the parsing status of a submitted document-parsing job.

    Args:
        job_id: Id returned by the job-submission API. Defaults to the
            sample job id that was previously hard-coded here.

    Returns:
        The status read from ``response.body.data.status``, or ``None``
        when the request or the response access raised an exception.
    """
    # Initialise the Credentials client with the default credentials.
    cred = CredClient()
    credential = cred.get_credential()
    config = open_api_models.Config(
        # AccessKey ID taken from the resolved credential
        credential.access_key_id,
        # AccessKey Secret taken from the resolved credential
        credential.access_key_secret,
    )
    # Service endpoint
    config.endpoint = 'docmind-api.cn-hangzhou.aliyuncs.com'
    client = docmind_api20220711Client(config)
    request = docmind_api20220711_models.QueryDocParserStatusRequest(
        # id: the id returned by the job-submission API
        id=job_id
    )
    try:
        response = client.query_doc_parser_status(request)
        # Response layout is body -> data -> attribute (attributes start
        # with a lowercase letter).
        status = response.body.data.status
        print(status)
        return status
    except Exception as error:
        # Not every exception has a ``message`` attribute, so fall back to
        # the exception itself, and report the failure instead of
        # discarding it.
        message = getattr(error, 'message', None) or error
        print(f"QueryDocParserStatus failed: {message}")
        return None
 query()
--- a/flask_app/main/文档理解大模型版/调用文档解析异步提交.py
+++ b/flask_app/main/文档理解大模型版/调用文档解析异步提交.py
@ -0,0 +1,40 @@
 from alibabacloud_docmind_api20220711.client import Client as docmind_api20220711Client
 from alibabacloud_tea_openapi import models as open_api_models
 from alibabacloud_docmind_api20220711 import models as docmind_api20220711_models
 from alibabacloud_tea_util.client import Client as UtilClient
 from alibabacloud_tea_util import models as util_models
 from alibabacloud_credentials.client import Client as CredClient
 def submit_file(file_path="./zbtest4.pdf", file_name=None):
    """Submit a local file as an asynchronous document-parsing job and print its id.

    Args:
        file_path: Path of the local file to upload. Defaults to the
            sample file that was previously hard-coded here.
        file_name: Name sent with the request; it must include the file
            type. Defaults to the base name of ``file_path``.

    Returns:
        The job id read from ``response.body.data.id``, or ``None`` when
        the request or the response access raised an exception.

    Raises:
        OSError: If ``file_path`` cannot be opened for reading.
    """
    import os

    if file_name is None:
        file_name = os.path.basename(file_path)
    # Initialise the Credentials client with the default credentials.
    cred = CredClient()
    credential = cred.get_credential()
    config = open_api_models.Config(
        # AccessKey ID taken from the resolved credential
        credential.access_key_id,
        # AccessKey Secret taken from the resolved credential
        credential.access_key_secret,
    )
    # Service endpoint
    config.endpoint = 'docmind-api.cn-hangzhou.aliyuncs.com'
    client = docmind_api20220711Client(config)
    runtime = util_models.RuntimeOptions()
    # Keep the stream open only for the duration of the upload so the file
    # handle is always closed afterwards.
    with open(file_path, "rb") as file_stream:
        request = docmind_api20220711_models.SubmitDocParserJobAdvanceRequest(
            # file_url_object: local file stream
            file_url_object=file_stream,
            # file_name: must include the file type (the alternative is
            # file_name_extension; use one or the other)
            file_name=file_name
        )
        try:
            response = client.submit_doc_parser_job_advance(request, runtime)
            # Response layout is body -> data -> attribute (attributes
            # start with a lowercase letter).
            job_id = response.body.data.id
            print(job_id)
            return job_id
        except Exception as error:
            # Not every exception has a ``message`` attribute, so fall back
            # to the exception itself, and report the failure instead of
            # discarding it.
            message = getattr(error, 'message', None) or error
            print(f"SubmitDocParserJob failed: {message}")
            return None
 submit_file()
--- a/flask_app/main/文档理解大模型版/调用文档解析结果获取.py
+++ b/flask_app/main/文档理解大模型版/调用文档解析结果获取.py
@ -0,0 +1,35 @@
 from typing import List
 from alibabacloud_docmind_api20220711.client import Client as docmind_api20220711Client
 from alibabacloud_tea_openapi import models as open_api_models
 from alibabacloud_docmind_api20220711 import models as docmind_api20220711_models
 from alibabacloud_tea_util.client import Client as UtilClient
 from alibabacloud_credentials.client import Client as CredClient
 def query(job_id='docmind-20240929-befe0a1a529b4640b5fbf51ff359a272',
          layout_step_size=10, layout_num=0):
    """Fetch and print the result of a document-parsing job.

    Args:
        job_id: Id returned by the job-submission API. Defaults to the
            sample job id that was previously hard-coded here.
        layout_step_size: Forwarded as ``layout_step_size`` of the request
            (default 10, the previous hard-coded value).
        layout_num: Forwarded as ``layout_num`` of the request (default 0,
            the previous hard-coded value).
            NOTE(review): these two presumably select a window of layout
            entries — confirm against the API reference.

    Returns:
        ``response.body.data``, or ``None`` when the request or the
        response access raised an exception.
    """
    # Initialise the Credentials client with the default credentials.
    cred = CredClient()
    credential = cred.get_credential()
    config = open_api_models.Config(
        # AccessKey ID taken from the resolved credential
        credential.access_key_id,
        # AccessKey Secret taken from the resolved credential
        credential.access_key_secret,
    )
    # Service endpoint
    config.endpoint = 'docmind-api.cn-hangzhou.aliyuncs.com'
    client = docmind_api20220711Client(config)
    request = docmind_api20220711_models.GetDocParserResultRequest(
        # id: the id returned by the job-submission API
        id=job_id,
        layout_step_size=layout_step_size,
        layout_num=layout_num
    )
    try:
        response = client.get_doc_parser_result(request)
        # Response layout is body -> data -> attribute (attributes start
        # with a lowercase letter).
        data = response.body.data
        print(data)
        return data
    except Exception as error:
        # Not every exception has a ``message`` attribute, so fall back to
        # the exception itself, and report the failure instead of
        # discarding it.
        message = getattr(error, 'message', None) or error
        print(f"GetDocParserResult failed: {message}")
        return None
 query()
--- a/flask_app/main/读取文件/按页读取pdf.py
+++ b/flask_app/main/读取文件/按页读取pdf.py
@ -72,6 +72,6 @@ def extract_text_by_page(file_path):
                print(f"Page {page_num + 1} is empty or text could not be extracted.")
        return result
 if __name__ == '__main__':
-    file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件（一中多媒体报告厅教学设备）_tobidders_notice_part2.pdf"
+    file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest20_tobidders_notice.pdf"
    res=extract_text_by_page(file_path)
    # print(res)
--- a/flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py
+++ b/flask_app/货物标/投标人须知正文条款提取成json文件货物标版.py
@ -120,9 +120,8 @@ def parse_text_by_heading(text):
                    data[current_key] = content_string.replace(' ', '')
                current_key = new_key
                current_content = [line_content]
-                # 只有当标题级别为两级（如 1.1）时，才设置 append_newline 为 True   #TODO:一级标题也是
+                # 当标题级别为一级（1.）和两级（如 1.1）时标题，才设置 append_newline 为 True
-                append_newline = len(new_key.split('.')) == 2
+                append_newline = len(new_key.rstrip('.').split('.')) <= 2
                print(new_key)
            else:
                append_newline = handle_content_append(current_content, line_content, append_newline, keywords)
        else:
@ -175,7 +174,7 @@ def convert_clause_to_json(input_path,output_folder,type=1):
 if __name__ == "__main__":
    # file_path = 'D:\\flask_project\\flask_app\\static\\output\\cfd4959d-5ea9-4112-8b50-9e543803f029\\ztbfile_tobidders_notice.pdf'
-    file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\招招招标文件（一中多媒体报告厅教学设备）_tobidders_notice_part2.pdf'
+    file_path='C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）_tobidders_notice_part2.pdf'
    output_folder = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp1'
    try:
        output_path = convert_clause_to_json(file_path,output_folder)