# zbparse/flask_app/工程标/形式响应评审.py

# -*- encoding:utf-8 -*-
import os
import re
import json
import time

from flask_app.general.多线程提问 import multi_threading
from flask_app.工程标.根据条款号整合json import process_and_merge_entries,process_and_merge2
from flask_app.general.json_utils import clean_json_string
from flask_app.工程标.提取json工程标版 import convert_clause_to_json
from flask_app.general.通义千问long import upload_file
from flask_app.general.merge_pdfs import merge_pdfs
# Role/system-style prompt text (written in Chinese) describing a
# document-processing expert. ``${document1}`` appears literally inside the
# string; presumably it is substituted by the LLM service rather than by
# Python -- TODO confirm. Nothing in the visible part of this module reads
# this constant.
prompt = """
# 角色
你是一个文档处理专家，专门负责理解和操作基于特定内容的文档任务，这包括解析、总结、搜索或生成与给定文档相关的各类信息。

## 技能
### 技能 1：文档解析与摘要
- 深入理解并分析${document1}的内容，提取关键信息。
- 根据需求生成简洁明了的摘要，保持原文核心意义不变。

### 技能 2：信息检索与关联
- 在${document1}中高效检索特定信息或关键词。
- 能够识别并链接到文档内部或外部的相关内容，增强信息的连贯性和深度。

## 限制
- 所有操作均需基于${document1}的内容，不可超出此范围创造信息。
- 在处理敏感或机密信息时，需遵守严格的隐私和安全规定。
- 确保所有生成或改编的内容逻辑连贯，无误导性信息。

请注意，上述技能执行时将直接利用并参考${document1}的具体内容，以确保所有产出紧密相关且高质量。
"""


def update_json_data(original_data, updates1, updates2, second_response_list):
    """
    Overwrite values in ``original_data`` with values from several update sources.

    Keys may use dot notation (``"a.b.c"``) to address nested dictionaries.
    Sources are applied in order -- ``updates1``, ``updates2``, then every
    dictionary in ``second_response_list`` -- so later sources win on conflict.
    When both the existing value and the new value are dictionaries they are
    shallow-merged; otherwise the new value replaces the old one.

    Args:
        original_data: dict that is updated in place.
        updates1: dict of (dotted) key -> value; ``None`` is treated as empty.
        updates2: dict of (dotted) key -> value; ``None`` is treated as empty.
        second_response_list: iterable of dicts of (dotted) key -> value;
            ``None`` is treated as empty and entries that are not dicts are
            skipped instead of raising.

    Returns:
        The same ``original_data`` object after all updates were applied.
    """
    def recursive_update(data, key, value):
        # Walk (and create where necessary) the nested dicts named by the
        # dotted key, then write the final component.
        keys = key.split('.')
        for k in keys[:-1]:
            child = data.get(k)
            if not isinstance(child, dict):
                # A scalar (or missing value) sits where a container is
                # needed: replace it so the nested update can proceed instead
                # of failing with AttributeError on the scalar.
                child = {}
                data[k] = child
            data = child
        leaf = keys[-1]
        existing = data.get(leaf)
        if isinstance(value, dict) and isinstance(existing, dict):
            data[leaf] = {**existing, **value}
        else:
            data[leaf] = value

    # Apply the two plain update dictionaries first, in order.
    for updates in (updates1, updates2):
        for key, value in (updates or {}).items():
            recursive_update(original_data, key, value)

    # Then apply every response dictionary, in list order.
    for response_dict in second_response_list or []:
        if not isinstance(response_dict, dict):
            # Not a usable key/value mapping; ignore it.
            continue
        for key, value in response_dict.items():
            recursive_update(original_data, key, value)

    return original_data


def judge_second_jump(input_dict):
    """
    Split off entries whose resolved text still points at the tender announcement.

    An entry is treated as a "second jump" when its value is a string that
    mentions "招标公告" or "第一章" (whitespace between the characters is
    tolerated). For each such entry a question is generated from the template
    below and the entry is removed from ``input_dict``. Values that are not
    strings cannot be matched against the pattern and are left untouched.

    Args:
        input_dict: dict of (dotted) key -> value. It is modified in place.

    Returns:
        A tuple ``(input_dict, formatted_questions)``: the same dictionary
        object without the removed entries, and the list of generated
        question strings in iteration order.
    """
    chapter_pattern = re.compile(r'招\s*标\s*公\s*告|第\s*一\s*章')

    question_template = (
        "请你根据招标文件中的内容回答：关于{short_key}{additional_text}，第一章招标公告中的内容是怎样的？"
        "请按json格式给我提供信息，键名为'{full_key}'，而键值需要完全与原文保持一致，不要擅自总结、删减，"
        "如果存在未知信息，请在对应键值处填'未知'。"
    )

    formatted_questions = []
    keys_to_remove = []
    for key, value in input_dict.items():
        # Guard against non-string values: re.search would raise TypeError.
        if not isinstance(value, str) or not chapter_pattern.search(value):
            continue
        formatted_questions.append(question_template.format(
            short_key=key.split('.')[-1],  # last dotted component
            full_key=key,
            additional_text=get_additional_text(key),
        ))
        keys_to_remove.append(key)

    # Delete after the loop: a dict must not change size while it is iterated.
    for key in keys_to_remove:
        del input_dict[key]

    return input_dict, formatted_questions

def extract_matching_keys(original_data):
    """
    Collect the entries whose value only refers to another clause.

    The input is walked recursively. A scalar value (converted with ``str``)
    is reported when all of the following hold:

    * neither the key nor the value contains one of ``exclude_patterns``;
    * the value matches one of ``include_patterns`` (e.g. "第...章", "第...条");
    * neither the key nor the value contains one of
      ``additional_exclude_patterns``;
    * the value matches one of ``additional_include_patterns``.

    Scalars that sit directly inside a list are not inspected; dictionaries
    inside a list are, and they inherit the path of the list.

    Example::

        {"投标要求": {"资质要求": "符合第3章第2条规定",
                      "业绩要求": "满足招标文件第4章相关条目"},
         "其他信息": {"投标文件格式": "按要求填写"}}

        -> [{"投标要求.资质要求": "符合第3章第2条规定"},
            {"投标要求.业绩要求": "满足招标文件第4章相关条目"}]

    Args:
        original_data: a dict/list structure, or a JSON string that is parsed
            with ``json.loads`` first.

    Returns:
        A list of single-entry dicts ``{dotted_path: value_as_str}`` in
        traversal order. The input is not modified.
    """
    data = json.loads(original_data) if isinstance(original_data, str) else original_data

    # Patterns that mark a value as a reference to another clause.
    include_patterns = [re.compile(r"符合第"), re.compile(r"第.*?章"), re.compile(r"第.*?款"), re.compile(r"第.*?项"), re.compile(r"第.*?目"), re.compile(r"第.*?条")]
    additional_include_patterns = [re.compile(r"规定"), re.compile(r"条目")]
    # Plain substrings; a hit in either the key or the value rejects the entry.
    exclude_patterns = ["投标文件格式", "权利义务", "技术标准", "工程量清单"]
    additional_exclude_patterns = ["按要求", "按规定"]

    final_matching = []

    def process_value(key, value, path):
        # Apply the four filters in their original order, bailing out early.
        if any(ex in key or ex in value for ex in exclude_patterns):
            return
        if not any(pattern.search(value) for pattern in include_patterns):
            return
        if any(ex in key or ex in value for ex in additional_exclude_patterns):
            return
        if any(pattern.search(value) for pattern in additional_include_patterns):
            # Nested keys are reported in dot notation.
            final_matching.append({".".join(path): value})

    def recursive_search(current_data, path=()):
        # ``path`` is an immutable tuple so the default value can never be
        # shared or mutated between calls.
        if isinstance(current_data, dict):
            for key, value in current_data.items():
                new_path = path + (key,)
                if isinstance(value, (dict, list)):
                    recursive_search(value, new_path)
                else:
                    process_value(key, str(value), new_path)
        elif isinstance(current_data, list):
            for item in current_data:
                recursive_search(item, path)

    recursive_search(data)

    return final_matching

    #同类表达扩展
def get_additional_text(key):
    """
    Return the alternative-wording hint that belongs to ``key``.

    The hint table is scanned in order and the hint of the first keyword that
    occurs as a substring of ``key`` is returned; an empty string is returned
    when no keyword occurs.
    """
    # (keyword, hint) pairs; extend this table to support more synonyms.
    synonym_hints = (
        ("投标内容", "（或招标范围）"),
        ("工程质量", "（或质量要求）"),
    )
    return next((hint for keyword, hint in synonym_hints if keyword in key), "")


def process_tender_entries(match_keys, output_folder, truncate0_jsonpath, clause_path):
    """
    Sort reference entries into "resolve by clause number" and "ask the model".

    Every entry is a single-item dict ``{dotted_key: reference_text}``, e.g.::

        [
            {"资质要求": "符合相关资质要求"},
            {"业绩要求": "3.1.2 近三年完成类似项目"},
            {"财务要求": "招标公告 2.3 财务状况良好"},
            {"人员要求": "项目经理具有相关资格"}
        ]

    Routing rules, applied per entry:

    * The text mentions the announcement ("招标公告" / "第一章"):
      if it also contains a clause number and a clause JSON could be produced
      from a ``*_notice.pdf`` found under ``output_folder``, the entry goes to
      ``entries_with_numbers2``; otherwise a question is generated.
    * The text contains a clause number and at least one of
      ``truncate0_jsonpath`` / ``clause_path`` exists on disk: the entry goes
      to ``entries_with_numbers``.
    * Anything else: a question is generated.

    Args:
        match_keys: list of single-item dicts as described above.
        output_folder: directory searched (recursively) for ``*_notice.pdf``;
            also passed to ``convert_clause_to_json``.
        truncate0_jsonpath: path of a JSON file; may be empty.
        clause_path: path of a JSON file; may be empty.

    Returns:
        ``(entries_with_numbers, formatted_questions, entries_with_numbers2,
        clause2_path)``. ``entries_with_numbers*`` are lists of
        ``{dotted_key: clause_number}``; ``clause2_path`` is ``""`` when no
        announcement clause JSON was produced, in which case
        ``entries_with_numbers2`` is empty.
    """
    truncate0_empty = not truncate0_jsonpath or not os.path.exists(truncate0_jsonpath)
    clause_empty = not clause_path or not os.path.exists(clause_path)
    both_empty = truncate0_empty and clause_empty
    entries_with_numbers = []
    entries_with_numbers2 = []
    formatted_questions = []
    clause2_path = ""

    # Clause number such as "3.7.3", optionally followed by a sub-item in
    # half-width "(3)" or full-width "（3）" parentheses.
    pattern = re.compile(r'(\d+(?:\.\d+)+)(?:[(（](\d+)[)）])?')

    # Matches "招标公告" and "第一章", tolerating whitespace between characters.
    chapter_pattern = re.compile(r'招\s*标\s*公\s*告|第\s*一\s*章')

    question_template = (
        "请你根据招标文件中的内容回答：关于{short_key}{additional_text}，{revised_value}的内容是怎样的？"
        "请按json格式给我提供信息，键名为'{full_key}'，而键值需要完全与原文保持一致，不要擅自总结、删减，"
        "如果存在未知信息，请在对应键值处填'未知'。"
    )

    def format_num(match):
        """Return the clause number, normalised to half-width parentheses."""
        return match.group(1) + (f"({match.group(2)})" if match.group(2) else "")

    def add_formatted_question(key, value):
        """Build a question for ``key`` and append it to ``formatted_questions``."""
        formatted_questions.append(question_template.format(
            short_key=key.split('.')[-1],  # last dotted component
            full_key=key,
            revised_value=value.replace('符合', ''),
            additional_text=get_additional_text(key),
        ))

    def find_notice_pdf():
        """Return the first ``*_notice.pdf`` below ``output_folder`` or ``""``."""
        return next(
            (os.path.join(root, file) for root, _, files in os.walk(output_folder)
             for file in files if file.endswith('_notice.pdf')), ""
        )

    # The announcement clause JSON is built lazily, at most once: the lookup
    # and the conversion are not repeated for every entry.
    clause2_attempted = False
    for entry in match_keys:
        if not entry:
            continue
        key, value = next(iter(entry.items()))
        match = pattern.search(value)

        if chapter_pattern.search(value):  # refers to the tender announcement
            if match and not clause2_attempted:
                clause2_attempted = True
                notice_path = find_notice_pdf()
                if notice_path:
                    clause2_path = convert_clause_to_json(notice_path, output_folder, 2) or ""
            if match and clause2_path:
                entries_with_numbers2.append({key: format_num(match)})
            else:
                # No clause number, no notice PDF, or the conversion produced
                # nothing: fall back to asking, so the entry is never dropped.
                add_formatted_question(key, value)
        elif match and not both_empty:
            entries_with_numbers.append({key: format_num(match)})
        else:
            add_formatted_question(key, value)

    # entries_with_numbers2 is only ever filled while clause2_path is set, so
    # an empty clause2_path always comes with an empty entries_with_numbers2.
    return entries_with_numbers, formatted_questions, entries_with_numbers2, clause2_path

def fetch_specific_pdf(output_folder):
    """
    Pick (or build) the PDF that follow-up questions are asked against.

    ``output_folder`` is walked recursively:

    * If a file ending in ``merged_baseinfo.pdf`` exists, it and every file
      ending in ``tobidders_notice.pdf`` are merged (in traversal order) into
      ``<output_folder>/merged_reviews.pdf`` via ``merge_pdfs`` and the value
      returned by ``merge_pdfs`` is returned.
    * Otherwise the first file ending in ``invalid.pdf`` is returned.
    * Otherwise ``""`` is returned.

    The decision is taken only after the whole tree was scanned, so the
    result does not depend on the order in which files are listed.

    Args:
        output_folder: directory to search.

    Returns:
        A file path, or ``""`` when no suitable file was found.
    """
    pdf_paths = []
    baseinfo_found = False
    invalid_path = ""

    for root, _dirs, files in os.walk(output_folder):
        for file in files:
            full_path = os.path.join(root, file)
            if file.endswith('merged_baseinfo.pdf'):
                baseinfo_found = True
                pdf_paths.append(full_path)
            elif file.endswith('tobidders_notice.pdf'):
                pdf_paths.append(full_path)
            elif not invalid_path and file.endswith('invalid.pdf'):
                # Remember the fallback, but keep scanning: a
                # merged_baseinfo.pdf listed later must still take precedence.
                invalid_path = full_path

    if baseinfo_found:
        output_path = os.path.join(output_folder, "merged_reviews.pdf")
        return merge_pdfs(pdf_paths, output_path)

    # No merged_baseinfo.pdf: fall back to invalid.pdf, or "" when absent.
    return invalid_path


def process_reviews(original_dict_data, output_folder, truncate0_jsonpath, clause_json_path):
    """
    Replace clause references in the review criteria with the referenced text.

    Steps, in order:

    1. ``extract_matching_keys`` finds the entries that only cite a clause.
    2. ``process_tender_entries`` splits them into numbered entries, numbered
       announcement entries and questions.
    3. ``process_and_merge_entries`` resolves the numbered entries;
       ``judge_second_jump`` turns results that still point at the
       announcement into further questions.
    4. ``process_and_merge2`` resolves the numbered announcement entries, if
       there are any.
    5. If there are questions and ``fetch_specific_pdf`` yields a file, the
       file is uploaded, the questions are sent via ``multi_threading`` and
       each answer is passed through ``clean_json_string``.
    6. ``update_json_data`` writes everything back into ``original_dict_data``.

    Returns:
        The value returned by ``update_json_data``.
    """
    reference_entries = extract_matching_keys(original_dict_data)

    numbered, direct_questions, numbered_notice, clause2_path = process_tender_entries(
        reference_entries, output_folder, truncate0_jsonpath, clause_json_path
    )

    # May come back as an empty dict.
    merged = process_and_merge_entries(numbered, truncate0_jsonpath, clause_json_path)

    # Either part of the result may be empty.
    resolved, jump_questions = judge_second_jump(merged)

    notice_resolved = process_and_merge2(numbered_notice, clause2_path) if numbered_notice else {}

    all_questions = direct_questions + jump_questions
    answers = []
    # Only look for a PDF when there is something to ask.
    review_pdf = fetch_specific_pdf(output_folder) if all_questions else ""
    if review_pdf:
        file_id = upload_file(review_pdf)
        # NOTE(review): the meaning of the "" and 2 arguments is defined by
        # multi_threading, which is outside this file -- confirm there.
        results = multi_threading(all_questions, "", file_id, 2)
        answers = [clean_json_string(res) for _, res in results] if results else []
        print(answers)

    return update_json_data(original_dict_data, resolved, notice_resolved, answers)

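# TODO: extraction from the clause / truncate JSON files is attempted whenever
# their paths are non-empty, but the extraction itself can still fail; in that
# case the question may have to be re-asked to the LLM.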
# Manual smoke test: runs process_reviews on a hard-coded sample of review
# criteria and prints the result together with the elapsed time. The paths
# below are absolute Windows paths of a developer machine.
if __name__ == "__main__":
    start_time=time.time()
    # knowledge_name="zbfile"
    truncate_tobidders_table_json_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\3395fc1b-432d-407d-a872-fc35e8475aef\\truncate_output.json"
    # Defined for reference only: process_reviews below is called with "" as
    # the clause path, so this variable is not used.
    clause_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\3395fc1b-432d-407d-a872-fc35e8475aef\\clause1.json"
    # Sample input: formal and responsiveness review criteria, several of
    # which only cite a chapter or clause number.
    original_dict_data={
        "形式评审标准": {
            "投标文件": "投标文件能正常打开",
            "投标人名称": "与营业执照、资质证书、安全生产许可证一致",
            "投标文件签字盖章": "符合第二章“投标人须知”第 3.7.3（3）目规定",
            "投标文件格式、内容": "符合第八章“投标文件格式”的要求，实质性内容齐全、关键字迹清晰可辨",
            "联合体投标人（如有）": "提交联合体协议书，并明确联合体牵头人",
            "报价唯一": "只能有一个有效报价（指投标函中的大写报价）",
            "多标段投标": "符合第一章规定",
            "“技术暗标”": "符合第二章“投标人须知”第 3.7.4（5）目规定"
        },
        "响应性评审标准": {
            "投标内容": "符合第二章“投标人须知”第 1.3.1 项规定",
            "工期": "符合第二章“投标人须知”第 1.3.2 项规定",
            "工程质量": "符合招标公告投标人须知”第 1.3.3 项规定",
            "投标有效期": "符合第二章“投标人须知”第 3.3.1 项规定",
            "投标保证金": "符合第二章“投标人须知”规定",
            "权利义务": "投标函附录中的相关承诺符合或优于第四章“合同条款及格式”的相关规定",
            "技术标准和要求": "符合第七章“技术标准和要求”规定",
            "分包计划": "符合第二章“投标人须知”规定",
            "算术错误修正": "1)投标人接受算术错误修正后的报价 2)修正后的报价与投标报价相比偏差率不超过±1%",
            "投标价格": "1）投标函中的大写报价与已标价工程量清单中的投标总价一致 2）投标函中的大写报价不大于本标段招标控制价总价 3）算术错误修正后的投标总报价不大于本标段招标控制价总价 4）投标报价不低于其成本",
            "已标价工程量清单": "1）已标价工程量清单项目编码顺序与第五章“工程量清单”给出的项目编码顺序一致； 2）已标价工程量清单符合第五章“工程量清单”给出的项目编码、项目名称、项目特征、计量单位和工程量； 3）暂列金额符合第五章“工程量清单”列出的金额； 4）专业工程暂估价符合第五章“工程量清单”列出的金额； 5）材料（工程设备）暂估价符合第五章“工程量清单”列出的单价并计入综合单价； 6）安全文明施工费、规费和税金等不可竞争费用，按照规定的标准计取； 7）计税方法符合招标文件的约定； 8）已标价工程量清单项目未填报的项目不超过三项，或不超过三项未填报的项目的费用合计（按招标控制价相应项目的费用合计计算）不超过其投标总报价（修正后的投标总报价，如有）的±1%"
        }
    }
    output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test"
    # notice_path="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_notice.pdf"
    # The clause JSON path is passed as "" here.
    formal_json = process_reviews(original_dict_data,output_folder, truncate_tobidders_table_json_path, "")
    # ensure_ascii=False keeps the Chinese text readable in the printed JSON.
    data = json.dumps(formal_json, ensure_ascii=False, indent=4)
    end_time=time.time()
    elapsed_time = end_time - start_time
    print(f"Function execution took {elapsed_time} seconds.")
    print(data)

#关于'技术暗标'，第二章“投标人须知”规定的内容是怎样的？请按json格式给我提供信息，键名为'技术暗标',请你忠于原文，回答要求完整准确，不要擅自总结、删减，且不要回答诸如'见投标人须知前附表'或'见第x.x项规定'这类无实质性内容的回答。
-.5 资格审查部分返回更能对应原文

											
										
										
											2024-09-05 18:00:40 +08:00
+								# -*- encoding:utf-8 -*-
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
+								import os
-.29

											
										
										
											2024-08-29 16:37:09 +08:00
+								import re
 								import json
 								import time
-.22代码结构优化

											
										
										
											2024-10-22 10:06:22 +08:00
+								from flask_app.general.多线程提问 import multi_threading
-.6 优化解析

											
										
										
											2024-12-06 14:40:22 +08:00
+								from flask_app.工程标.根据条款号整合json import process_and_merge_entries,process_and_merge2
-.22代码结构优化

											
										
										
											2024-10-22 10:06:22 +08:00
+								from flask_app.general.json_utils import clean_json_string
-.6 优化解析

											
										
										
											2024-12-06 14:40:22 +08:00
+								from flask_app.工程标.提取json工程标版 import convert_clause_to_json
-.22代码结构优化

											
										
										
											2024-10-22 10:06:22 +08:00
+								from flask_app.general.通义千问long import upload_file
-.1适配货物标

											
										
										
											2024-11-01 14:39:39 +08:00
+								from flask_app.general.merge_pdfs import merge_pdfs
-.29

											
										
										
											2024-08-29 16:37:09 +08:00
+								prompt = """
 								# 角色
 								你是一个文档处理专家，专门负责理解和操作基于特定内容的文档任务，这包括解析、总结、搜索或生成与给定文档相关的各类信息。
 								## 技能
 								### 技能 1：文档解析与摘要
 								- 深入理解并分析${document1}的内容，提取关键信息。
 								- 根据需求生成简洁明了的摘要，保持原文核心意义不变。
 								### 技能 2：信息检索与关联
 								- 在${document1}中高效检索特定信息或关键词。
 								- 能够识别并链接到文档内部或外部的相关内容，增强信息的连贯性和深度。
 								## 限制
 								- 所有操作均需基于${document1}的内容，不可超出此范围创造信息。
 								- 在处理敏感或机密信息时，需遵守严格的隐私和安全规定。
 								- 确保所有生成或改编的内容逻辑连贯，无误导性信息。
 								请注意，上述技能执行时将直接利用并参考${document1}的具体内容，以确保所有产出紧密相关且高质量。
 								"""
-.3表格提取bug 联合体投标

											
										
										
											2024-09-03 18:04:05 +08:00
-.6部分文件引用了招标公告中的内容，添加了对二次跳转的处理

											
										
										
											2024-09-06 15:50:52 +08:00
+								def update_json_data(original_data, updates1, updates2,second_response_list):
-.5 资格审查部分返回更能对应原文

											
										
										
											2024-09-05 18:00:40 +08:00
+								    """
 								    根据提供的更新字典覆盖原始JSON数据中对应的键值，支持点分隔的键来表示嵌套结构。
 								    参数:
 								    - original_data: dict, 原始的JSON数据。
 								    - updates: dict, 包含需要更新的键值对。
 								    - second_response_list: list, 包含多个字典，每个字典包含需要更新的键值对。
 								    返回:
 								    - updated_data: dict, 更新后的JSON数据。
 								    """
 								    def recursive_update(data, key, value):
 								        # 处理点分隔的键，递归定位并更新嵌套字典
 								        keys = key.split('.')
 								        for k in keys[:-1]:
 								            data = data.setdefault(k, {})
 								        if isinstance(value, dict) and isinstance(data.get(keys[-1], None), dict):
 								            data[keys[-1]] = {**data.get(keys[-1], {}), **value}
 								        else:
 								            data[keys[-1]] = value
 								    # 合并 updates 到 original_data 中
-.6部分文件引用了招标公告中的内容，添加了对二次跳转的处理

											
										
										
											2024-09-06 15:50:52 +08:00
+								    for key, value in updates1.items():
 								        recursive_update(original_data, key, value)
 								    for key,value in updates2.items():
-.5 资格审查部分返回更能对应原文

											
										
										
											2024-09-05 18:00:40 +08:00
+								        recursive_update(original_data, key, value)
 								    # 遍历 second_response_list 中的每个字典，并合并到 original_data 中
 								    for response_dict in second_response_list:
 								        for key, value in response_dict.items():
 								            recursive_update(original_data, key, value)
 								    return original_data
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
-.6部分文件引用了招标公告中的内容，添加了对二次跳转的处理

											
										
										
											2024-09-06 15:50:52 +08:00
+								def judge_second_jump(input_dict):
 								    formatted_questions = []
 								    keys_to_remove = []
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
+								    chapter_pattern = re.compile(r'招\s*标\s*公\s*告|第\s*一\s*章')
 								    question_template = (
 								        "请你根据招标文件中的内容回答：关于{short_key}{additional_text}，第一章招标公告中的内容是怎样的？"
 								        "请按json格式给我提供信息，键名为'{full_key}'，而键值需要完全与原文保持一致，不要擅自总结、删减，"
 								        "如果存在未知信息，请在对应键值处填'未知'。"
 								    )
-.6部分文件引用了招标公告中的内容，添加了对二次跳转的处理

											
										
										
											2024-09-06 15:50:52 +08:00
+								    for key, value in input_dict.items():
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
+								        if chapter_pattern.search(value):
 								            short_key = key.split('.')[-1] if '.' in key else key
 								            additional_text = get_additional_text(key)
 								            formatted_entry = question_template.format(
 								                short_key=short_key,
 								                full_key=key,
 								                additional_text=additional_text
 								            )
-.6部分文件引用了招标公告中的内容，添加了对二次跳转的处理

											
										
										
											2024-09-06 15:50:52 +08:00
+								            formatted_questions.append(formatted_entry)
 								            keys_to_remove.append(key)  # 标记需要删除的键
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
-.6部分文件引用了招标公告中的内容，添加了对二次跳转的处理

											
										
										
											2024-09-06 15:50:52 +08:00
+								    # 从原始字典中删除处理过的键值对
 								    for key in keys_to_remove:
 								        del input_dict[key]
 								    return input_dict, formatted_questions
-.5 资格审查部分返回更能对应原文

											
										
										
											2024-09-05 18:00:40 +08:00
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
+								def extract_matching_keys(original_data):
 								    """
 								    提取带序号的内容，对于已经完整的内容不提取，但会在后面用新数据更新orginal_data
 								    :param json_data:
 								    {
 								  "投标要求": {
 								    "资质要求": "符合第3章第2条规定",
 								    "业绩要求": "满足招标文件第4章相关条目"
 								  },
 								  "其他信息": {
 								    "投标文件格式": "按要求填写"
 								  }
 								}
 								    :return:
 								    [
 								        {"投标要求.资质要求": "符合第3章第2条规定"},
 								        {"投标要求.业绩要求": "满足招标文件第4章相关条目"}
 								    ]
 								    """
-.6部分文件引用了招标公告中的内容，添加了对二次跳转的处理

											
										
										
											2024-09-06 15:50:52 +08:00
+								    #'投标人名称': '符合招标公告第3.2条规定'
-.29

											
										
										
											2024-08-29 16:37:09 +08:00
+								    # 函数首先检查输入 json_data 是否为字符串类型。如果是，它会使用 json.loads() 将字符串解析为字典。
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
+								    if isinstance(original_data, str):
 								        data = json.loads(original_data)
-.29

											
										
										
											2024-08-29 16:37:09 +08:00
+								    else:
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
+								        data = original_data
-.29

											
										
										
											2024-08-29 16:37:09 +08:00
 								    # 正则表达式匹配
-.6部分文件引用了招标公告中的内容，添加了对二次跳转的处理

											
										
										
											2024-09-06 15:50:52 +08:00
+								    include_patterns = [re.compile(r"符合第"), re.compile(r"第.*?章"), re.compile(r"第.*?款"), re.compile(r"第.*?项"), re.compile(r"第.*?目"),re.compile(r"第.*?条")]
-.29

											
										
										
											2024-08-29 16:37:09 +08:00
+								    additional_include_patterns = [re.compile(r"规定"), re.compile(r"条目")]
 								    exclude_patterns = ["投标文件格式", "权利义务", "技术标准","工程量清单"]
 								    additional_exclude_patterns = ["按要求", "按规定"]
 								    # Initialize a list to hold filtered key-value pairs
 								    final_matching = []
 								    # Recursive function to traverse and filter data
 								    def recursive_search(current_data, path=[]):
 								        if isinstance(current_data, dict):
 								            for key, value in current_data.items():
 								                new_path = path + [key]  # Update path for nested keys
 								                if isinstance(value, (dict, list)):
 								                    recursive_search(value, new_path)
 								                else:
 								                    process_value(key, str(value), new_path)
 								        elif isinstance(current_data, list):
 								            for item in current_data:
 								                recursive_search(item, path)
 								    # Function to process each value against the patterns
 								    def process_value(key, value, path):
 								        # Check exclude patterns first
 								        if any(ex in key or ex in value for ex in exclude_patterns):
 								            return
 								        # Main include patterns
 								        if any(pattern.search(value) for pattern in include_patterns):
 								            # Additional exclude patterns
 								            if not any(ex in key or ex in value for ex in additional_exclude_patterns):
 								                # Additional include patterns
 								                if any(pattern.search(value) for pattern in additional_include_patterns):
 								                    final_matching.append({".".join(path): value})  # Use dot notation for nested keys
 								    # Start the recursive search
 								    recursive_search(data)
 								    return final_matching
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
+								    #同类表达扩展
 								def get_additional_text(key):
 								    additional_text_map = {
 								        "投标内容": "（或招标范围）",
 								        "工程质量": "（或质量要求）",
 								        # 可以在这里添加更多的映射
 								    }
 								    for keyword, text in additional_text_map.items():
 								        if keyword in key:
 								            return text
 								    return ""
-.30健壮性优化

											
										
										
											2024-10-30 16:56:05 +08:00
 								def process_tender_entries(match_keys, output_folder, truncate0_jsonpath, clause_path):
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
+								    """
-.30健壮性优化

											
										
										
											2024-10-30 16:56:05 +08:00
+								    处理招标条目，根据提供的匹配键生成不同的输出。
 								    :param match_keys: 匹配键列表，每个键是一个字典，格式如下：
 								        [
 								            {"资质要求": "符合相关资质要求"},
 								            {"业绩要求": "3.1.2 近三年完成类似项目"},
 								            {"财务要求": "招标公告 2.3 财务状况良好"},
 								            {"人员要求": "项目经理具有相关资格"}
 								        ]
 								    :param output_folder: 输出文件夹路径
 								    :param truncate0_jsonpath: 截断 JSON 路径
 								    :param clause_path: 条款路径
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
+								    :return:
-.30健壮性优化

											
										
										
											2024-10-30 16:56:05 +08:00
+								        - entries_with_numbers: 包含数字信息的条目列表（可能为空）
 								        - formatted_questions: 格式化的问题列表
 								        - entries_with_numbers2: 包含数字信息的第二类条目列表（可能为空）
 								        - clause2_path: 第二类条目的 JSON 路径（可能为空）
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
+								    """
-.30健壮性优化

											
										
										
											2024-10-30 16:56:05 +08:00
+								    truncate0_empty = not truncate0_jsonpath or not os.path.exists(truncate0_jsonpath)
 								    clause_empty = not clause_path or not os.path.exists(clause_path)
 								    both_empty = truncate0_empty and clause_empty
-.29

											
										
										
											2024-08-29 16:37:09 +08:00
+								    entries_with_numbers = []
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
+								    entries_with_numbers2 = []
-.29

											
										
										
											2024-08-29 16:37:09 +08:00
+								    formatted_questions = []
-.6部分文件引用了招标公告中的内容，添加了对二次跳转的处理

											
										
										
											2024-09-06 15:50:52 +08:00
+								    clause2_path = ""
-.29

											
										
										
											2024-08-29 16:37:09 +08:00
 								    # 正则表达式，同时匹配全角和半角括号
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
+								    pattern = re.compile(r'(\d+(?:\.\d+)+)(?:[$\（](\d+)[$\）])?')
 								    # 新的正则表达式，用于匹配 '招标公告' 和 '第一章'，允许内部有空格
 								    chapter_pattern = re.compile(r'招\s*标\s*公\s*告|第\s*一\s*章')
-.6部分文件引用了招标公告中的内容，添加了对二次跳转的处理

											
										
										
											2024-09-06 15:50:52 +08:00
-.26 分段解析完全版

											
										
										
											2024-09-26 18:06:23 +08:00
+								    question_template = (
-.30健壮性优化

											
										
										
											2024-10-30 16:56:05 +08:00
+								        "请你根据招标文件中的内容回答：关于{short_key}{additional_text}，{revised_value}的内容是怎样的？"
 								        "请按json格式给我提供信息，键名为'{full_key}'，而键值需要完全与原文保持一致，不要擅自总结、删减，"
 								        "如果存在未知信息，请在对应键值处填'未知'。"
-.26 分段解析完全版

											
										
										
											2024-09-26 18:06:23 +08:00
+								    )
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
 								    def format_num(match):
-.30健壮性优化

											
										
										
											2024-10-30 16:56:05 +08:00
+								        """格式化匹配到的数字"""
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
+								        return match.group(1) + (f"({match.group(2)})" if match.group(2) else "")
-.30健壮性优化

											
										
										
											2024-10-30 16:56:05 +08:00
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
+								    def add_formatted_question(key, value):
-.30健壮性优化

											
										
										
											2024-10-30 16:56:05 +08:00
+								        """根据键值生成格式化的问题并添加到 formatted_questions 列表中"""
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
+								        revised_value = value.replace('符合', '')
 								        short_key = key.split('.')[-1] if '.' in key else key
-.30健壮性优化

											
										
										
											2024-10-30 16:56:05 +08:00
+								        additional_text = get_additional_text(key)  # 确保该函数已定义
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
+								        formatted_questions.append(question_template.format(
 								            short_key=short_key,
 								            full_key=key,
 								            revised_value=revised_value,
 								            additional_text=additional_text
 								        ))
-.30健壮性优化

											
										
										
											2024-10-30 16:56:05 +08:00
+								        # 标记是否已经生成 clause2_path
 								    clause2_generated = False
-.29

											
										
										
											2024-08-29 16:37:09 +08:00
+								    for entry in match_keys:
 								        key, value = next(iter(entry.items()))
 								        match = pattern.search(value)
-.6部分文件引用了招标公告中的内容，添加了对二次跳转的处理

											
										
										
											2024-09-06 15:50:52 +08:00
-.30健壮性优化

											
										
										
											2024-10-30 16:56:05 +08:00
+								        if chapter_pattern.search(value):         #招标公告
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
+								            if match:
-.30健壮性优化

											
										
										
											2024-10-30 16:56:05 +08:00
+								                if not clause2_generated:
 								                    notice_path = next(
 								                        (os.path.join(root, file) for root, _, files in os.walk(output_folder)
 								                         for file in files if file.endswith('_notice.pdf')), ""
 								                    )
 								                    if notice_path:
 								                        clause2_path = convert_clause_to_json(notice_path, output_folder, 2)  # 确保该函数已定义
 								                        clause2_generated = True  # 标记已经生成
 								                        entries_with_numbers2.append({key: format_num(match)})
 								                    else:
 								                        add_formatted_question(key, value)
 								                else:
 								                    # 如果已经生成 clause2_path，直接添加到 entries_with_numbers2
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
+								                    entries_with_numbers2.append({key: format_num(match)})
-.6部分文件引用了招标公告中的内容，添加了对二次跳转的处理

											
										
										
											2024-09-06 15:50:52 +08:00
+								            else:
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
+								                add_formatted_question(key, value)
-.6部分文件引用了招标公告中的内容，添加了对二次跳转的处理

											
										
										
											2024-09-06 15:50:52 +08:00
+								        elif match:
-.30健壮性优化

											
										
										
											2024-10-30 16:56:05 +08:00
+								            if both_empty:
 								                add_formatted_question(key, value)
 								            else:
 								                entries_with_numbers.append({key: format_num(match)})
-.29

											
										
										
											2024-08-29 16:37:09 +08:00
+								        else:
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
+								            add_formatted_question(key, value)
-.29

											
										
										
											2024-08-29 16:37:09 +08:00
-.30健壮性优化

											
										
										
											2024-10-30 16:56:05 +08:00
+								    # 如果 clause2_path 为空，不生成 entries_with_numbers2
 								    if not clause2_path:
 								        entries_with_numbers2 = []
-.6部分文件引用了招标公告中的内容，添加了对二次跳转的处理

											
										
										
											2024-09-06 15:50:52 +08:00
-.30健壮性优化

											
										
										
											2024-10-30 16:56:05 +08:00
+								    return entries_with_numbers, formatted_questions, entries_with_numbers2, clause2_path
-.6部分文件引用了招标公告中的内容，添加了对二次跳转的处理

											
										
										
											2024-09-06 15:50:52 +08:00
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
def fetch_specific_pdf(output_folder):
    """
    Pick the PDF that follow-up review questions should be asked against.

    Walks ``output_folder`` recursively and looks at file-name suffixes:

    * ``merged_baseinfo.pdf`` and ``tobidders_notice.pdf`` files are collected,
      in the order they are encountered, as inputs for a merged document.
    * The first ``invalid.pdf`` file is remembered as a fallback.

    The decision is made only after the whole tree has been scanned, so the
    result does not depend on the order in which the file system happens to
    list the files (previously an ``invalid.pdf`` listed before
    ``merged_baseinfo.pdf`` was returned even though the latter existed).

    Args:
        output_folder: Directory to scan; also where ``merged_reviews.pdf``
            is written.

    Returns:
        * The value returned by ``merge_pdfs`` for ``merged_reviews.pdf`` when
          at least one ``merged_baseinfo.pdf`` was found.
        * Otherwise the path of the first ``invalid.pdf`` file found.
        * Otherwise an empty string.
    """
    pdf_paths = []
    baseinfo_found = False
    fallback_path = ""

    for root, _dirs, files in os.walk(output_folder):
        for file in files:
            full_path = os.path.join(root, file)
            if file.endswith('merged_baseinfo.pdf'):
                baseinfo_found = True
                pdf_paths.append(full_path)
            elif file.endswith('tobidders_notice.pdf'):
                pdf_paths.append(full_path)
            elif not fallback_path and file.endswith('invalid.pdf'):
                # Only a candidate: it is used solely when no
                # merged_baseinfo.pdf turns up anywhere in the tree.
                fallback_path = full_path

    if baseinfo_found:
        # pdf_paths is necessarily non-empty here (it holds the baseinfo file).
        output_path = os.path.join(output_folder, "merged_reviews.pdf")
        return merge_pdfs(pdf_paths, output_path)

    # No merged_baseinfo.pdf: fall back to invalid.pdf, or "" if there is none.
    return fallback_path
-.29

											
										
										
											2024-08-29 16:37:09 +08:00
-.21工程标快速版

											
										
										
											2024-10-21 17:31:48 +08:00
def process_reviews(original_dict_data, output_folder, truncate0_jsonpath, clause_json_path):
    """
    Fill in the review-criteria dict with the clause text it refers to.

    Steps, in order:
      1. Extract the keys of interest from ``original_dict_data`` and split
         them (via ``process_tender_entries``) into entries carrying clause
         numbers, ready-made questions, and a second set of numbered entries
         tied to ``clause2_path``.
      2. Resolve the first set of numbered entries against the JSON files,
         then let ``judge_second_jump`` separate resolved values from
         additional questions.
      3. Resolve the second set of numbered entries, if there are any.
      4. If any questions remain, upload the PDF chosen by
         ``fetch_specific_pdf`` and ask them through ``multi_threading``.
      5. Merge everything back with ``update_json_data``.

    Args:
        original_dict_data: Review-criteria dict; it is passed to
            ``update_json_data``, which updates it in place.
        output_folder: Folder holding the split PDFs.
        truncate0_jsonpath: Path forwarded to the entry-resolution helpers.
        clause_json_path: Path forwarded to the entry-resolution helpers.

    Returns:
        The dict returned by ``update_json_data``.
    """
    matched_keys = extract_matching_keys(original_dict_data)
    (numbered_entries,
     direct_questions,
     numbered_entries2,
     clause2_path) = process_tender_entries(
        matched_keys, output_folder, truncate0_jsonpath, clause_json_path
    )

    # May come back as {} when nothing could be resolved.
    merged_primary = process_and_merge_entries(
        numbered_entries, truncate0_jsonpath, clause_json_path
    )
    # May come back as {} and [] respectively.
    resolved_primary, jump_questions = judge_second_jump(merged_primary)

    if numbered_entries2:
        resolved_secondary = process_and_merge2(numbered_entries2, clause2_path)
    else:
        resolved_secondary = {}

    model_answers = []
    pending_questions = direct_questions + jump_questions
    # The PDF lookup (which may merge files) only runs when there is something to ask.
    review_pdf = fetch_specific_pdf(output_folder) if pending_questions else ""
    if review_pdf:
        file_id = upload_file(review_pdf)
        raw_results = multi_threading(pending_questions, "", file_id, 2)
        if raw_results:
            model_answers = [clean_json_string(answer) for _, answer in raw_results]
        else:
            model_answers = []
        print(model_answers)

    return update_json_data(
        original_dict_data, resolved_primary, resolved_secondary, model_answers
    )
-.29

											
										
										
											2024-08-29 16:37:09 +08:00
-.31

											
										
										
											2024-10-31 20:12:08 +08:00
+								#TODO:现在是clause truncate_jsonpath非空就尝试从中提取，但是有可能从里面提取失败，这个时候可能需要重新问大模型。
-.29

											
										
										
											2024-08-29 16:37:09 +08:00
if __name__ == "__main__":
    # Manual smoke run against sample files on a developer machine;
    # every path below is machine-specific.
    start_time = time.time()
    # knowledge_name="zbfile"
    truncate_tobidders_table_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\3395fc1b-432d-407d-a872-fc35e8475aef\\truncate_output.json"
    # Not passed on below: process_reviews is called with "" as its clause path.
    clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\3395fc1b-432d-407d-a872-fc35e8475aef\\clause1.json"

    # Sample "formal review" criteria.
    formal_criteria = {
        "投标文件": "投标文件能正常打开",
        "投标人名称": "与营业执照、资质证书、安全生产许可证一致",
        "投标文件签字盖章": "符合第二章“投标人须知”第 3.7.3（3）目规定",
        "投标文件格式、内容": "符合第八章“投标文件格式”的要求，实质性内容齐全、关键字迹清晰可辨",
        "联合体投标人（如有）": "提交联合体协议书，并明确联合体牵头人",
        "报价唯一": "只能有一个有效报价（指投标函中的大写报价）",
        "多标段投标": "符合第一章规定",
        "“技术暗标”": "符合第二章“投标人须知”第 3.7.4（5）目规定",
    }
    # Sample "responsiveness review" criteria.
    responsive_criteria = {
        "投标内容": "符合第二章“投标人须知”第 1.3.1 项规定",
        "工期": "符合第二章“投标人须知”第 1.3.2 项规定",
        "工程质量": "符合招标公告投标人须知”第 1.3.3 项规定",
        "投标有效期": "符合第二章“投标人须知”第 3.3.1 项规定",
        "投标保证金": "符合第二章“投标人须知”规定",
        "权利义务": "投标函附录中的相关承诺符合或优于第四章“合同条款及格式”的相关规定",
        "技术标准和要求": "符合第七章“技术标准和要求”规定",
        "分包计划": "符合第二章“投标人须知”规定",
        "算术错误修正": "1)投标人接受算术错误修正后的报价 2)修正后的报价与投标报价相比偏差率不超过±1%",
        "投标价格": "1）投标函中的大写报价与已标价工程量清单中的投标总价一致 2）投标函中的大写报价不大于本标段招标控制价总价 3）算术错误修正后的投标总报价不大于本标段招标控制价总价 4）投标报价不低于其成本",
        "已标价工程量清单": "1）已标价工程量清单项目编码顺序与第五章“工程量清单”给出的项目编码顺序一致； 2）已标价工程量清单符合第五章“工程量清单”给出的项目编码、项目名称、项目特征、计量单位和工程量； 3）暂列金额符合第五章“工程量清单”列出的金额； 4）专业工程暂估价符合第五章“工程量清单”列出的金额； 5）材料（工程设备）暂估价符合第五章“工程量清单”列出的单价并计入综合单价； 6）安全文明施工费、规费和税金等不可竞争费用，按照规定的标准计取； 7）计税方法符合招标文件的约定； 8）已标价工程量清单项目未填报的项目不超过三项，或不超过三项未填报的项目的费用合计（按招标控制价相应项目的费用合计计算）不超过其投标总报价（修正后的投标总报价，如有）的±1%",
    }
    original_dict_data = {
        "形式评审标准": formal_criteria,
        "响应性评审标准": responsive_criteria,
    }

    output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test"
    # notice_path="C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_notice.pdf"
    formal_json = process_reviews(
        original_dict_data, output_folder, truncate_tobidders_table_json_path, ""
    )
    data = json.dumps(formal_json, ensure_ascii=False, indent=4)

    # Elapsed time covers the review run and the JSON serialisation above.
    elapsed_time = time.time() - start_time
    print(f"Function execution took {elapsed_time} seconds.")
    print(data)
-.4截取pdf提升健壮性，考虑截取失败的情况

											
										
										
											2024-09-04 17:49:05 +08:00
 								#关于'技术暗标'，第二章“投标人须知”规定的内容是怎样的？请按json格式给我提供信息，键名为'技术暗标',请你忠于原文，回答要求完整准确，不要擅自总结、删减，且不要回答诸如'见投标人须知前附表'或'见第x.x项规定'这类无实质性内容的回答。