# zbparse/flask_app/货物标/评分标准提取main.py

# -*- encoding:utf-8 -*-
import json
import re
from collections import defaultdict
from flask_app.main.通义千问long import upload_file, qianwen_long


def combine_technical_and_business(data, target_values):
    """
    Split parsed scoring data into technical entries and one merged business section.

    Every dict reachable from `data` (directly, or through any depth of list
    nesting) is scanned one level deep:
      * a key containing any substring from `target_values` is copied to the
        top level of the result under its original name;
      * every other key is stored inside the '商务标' entry of the result.
    Values are stored as-is; dict values are never descended into.

    Args:
        data: Parsed scoring structure. Anything other than a dict or a list
            contributes nothing to the result.
        target_values: Substrings that mark a key as technical content.

    Returns:
        dict: The collected entries. '技术标' is set to '' when no technical key
        was seen, and '商务标' is set to '' when no business key was seen.
    """
    collected = {}
    seen = {'technical': False, 'business': False}

    def walk(node):
        # Lists are only containers: look inside each element in order.
        if isinstance(node, list):
            for element in node:
                walk(element)
            return
        if not isinstance(node, dict):
            return
        for name, payload in node.items():
            is_technical_key = any(target in name for target in target_values)
            if is_technical_key:
                collected[name] = payload
                seen['technical'] = True
            else:
                # Everything that is not technical is grouped as business content.
                collected.setdefault('商务标', {})[name] = payload
                seen['business'] = True

    walk(data)

    # Guarantee that a placeholder exists for whichever side was absent.
    if not seen['technical']:
        collected['技术标'] = ''
    if not seen['business']:
        collected['商务标'] = ''
    return collected


# 防止外键只有一个'一包'的情况
def process_data_based_on_key(data):
    """
    Strip a redundant single outer wrapper key from the parsed result.

    When `data` has exactly one key and that key contains "包", "未知" or
    "评分因素", the value stored under that key is returned. In every other
    case `data` itself is returned unchanged.

    Args:
        data (dict): Parsed scoring dictionary.

    Returns:
        The unwrapped inner value, or `data` when no unwrapping applies.
    """
    wrapper_markers = ("包", "未知", "评分因素")
    key_names = tuple(data.keys())

    # Only a lone outer key can be a wrapper.
    if len(key_names) != 1:
        return data

    sole_key = key_names[0]
    for marker in wrapper_markers:
        if marker in sole_key:
            return data[sole_key]
    return data


def parse_json_with_duplicates(raw_string):
    """
    Parse the JSON object embedded in a string, keeping every value of duplicated keys.

    The text from the first '{' to the last '}' in `raw_string` is decoded as
    JSON. Within each JSON object, a key that occurs more than once maps to a
    list of all its values in order of appearance; a key that occurs once keeps
    its single value unchanged.

    Args:
        raw_string (str): Text that contains a JSON object, possibly surrounded
            by other text.

    Returns:
        dict: The decoded object. An empty dict is returned when `raw_string`
        is not a string, is blank, contains no '{...}' span, or when that span
        is not valid JSON.

    Example:
        Input:  '{"a": {"score": "2"}, "a": {"score": "3"}}'
        Output: {"a": [{"score": "2"}, {"score": "3"}]}
    """

    def custom_object_pairs_hook(pairs):
        # json.loads invokes this hook for every JSON object, innermost first,
        # so nested objects inside `pairs` have already had their duplicate
        # keys merged and need no further processing here.
        grouped = defaultdict(list)
        for key, value in pairs:
            grouped[key].append(value)
        # Keys seen several times become lists; keys seen once keep their value.
        return {key: (values if len(values) > 1 else values[0]) for key, values in grouped.items()}

    # None or any other non-string reply is treated like an empty reply
    # instead of failing on .strip().
    if not isinstance(raw_string, str) or not raw_string.strip():
        return {}

    match = re.search(r'\{[\s\S]*\}', raw_string)
    if not match:
        print("json_utils: extract_content_from_json: No valid JSON content found.")
        return {}

    try:
        return json.loads(match.group(0), object_pairs_hook=custom_object_pairs_hook)
    except json.JSONDecodeError as e:
        print(f"json_utils: extract_content_from_json: JSON decode error: {e}")
        return {}


def combine_evaluation_standards(truncate_file):
    """
    Query the model for the scoring standards of a document and split them into
    technical and business sections.

    Steps:
      1. Pass `truncate_file` to `upload_file` and use the returned id for all
         `qianwen_long` queries.
      2. Ask whether the document contains concrete scoring requirements. If the
         reply contains '否', return the default structure with empty '技术标'
         and '商务标' (covers procurement types without scoring requirements,
         such as competitive negotiation).
      3. Otherwise request the scoring details as JSON, parse the reply with
         `parse_json_with_duplicates`, and unwrap a lone outer key with
         `process_data_based_on_key`.
      4. If any outer key contains one of '一包'..'五包', run
         `combine_technical_and_business` on each such key separately (outer
         keys without such a marker are left out of the result); otherwise run
         it on the whole structure.

    Args:
        truncate_file: File handed to `upload_file`.

    Returns:
        dict: Either a mapping of package name to its split result, or a single
        split result.
    """
    file_id = upload_file(truncate_file)

    # First query: does the document contain scoring requirements at all?
    judge_res = qianwen_long(
        file_id,
        "根据该文档，你判断它是否有具体的关于技术评分或商务评分或投标报价的评分要求，如果有，返回'是'，否则返回'否'。",
    )
    if '否' in judge_res:
        # No scoring requirements: hand back the empty default structure.
        return {'技术标': '', '商务标': ''}

    # Second query: ask for the detailed scoring items as JSON (prompt includes a sample output).
    scoring_prompt = (
            """
            根据该文档中的评标办法前附表，请你列出该文件的技术评分，商务评分，投标报价评审以及它们对应的具体评分要求，请以json格式返回结果，请在这三大块评分中分别用若干键值对表示具体要求，请精确到具体的评审项，内层的键名为'评分'及'要求'，若这三大块评分中存在其他信息，则在相应评分大块中新增键名'备注'存放该信息，键值为具体的要求，否则不需要。如果评分内容（因素）不是这3个，则返回文档中给定的评分内容（因素）以及它们的具体评分要求，如果该采购活动有多个包，则最外层键名为对应的包名,否则不需要。不要回答有关资格审查的内容，若存在未知信息，填充'未知'。以下为示例输出：
            {
    "一包": {
        "技术评分": {
            "主要监理岗位的职责": {
                "评分": "4分",
                "要求": "1、总监理工程师的职责全面、清晰、合理得 1.2-2分；一般的1.2分。2、其他主要监理人员及岗位的职责全面、清晰、合理得 1.2-2分；一般的 1.2分。"
            }
        },
        "商务评分": {
            "控制系统内主板": {
                "评分": "10分",
                "要求": "所投电梯控制系统内主板为制造商原厂原品牌制造生产且为进口部件得 10分。（提供进口部件报关单及原产地证明扫描件加盖公章，否则不得分）"
            },
            "制造商技术实力": [
                {
                    "评分": "3分",
                    "要求": "一级证书得3分，二级证书得1分，其他不得分"
                },
                {
                    "评分": "2分",
                    "要求": "行业销量排名连续前 2 名，得 2 分，第 4-6 名得 0.5 分，其他不得分"
                }
            ]
        },
        "投标报价评审": {
            "投标报价是否出现违反计价规范": {
                "评分": "合格/不合格",
                "要求": "A:投标报价未违反计价规范的，评审意见为“合格”；B：投标报价违反计价规范的，评审意见为“不合格”"
            }
        }
    }
}
            """
    )
    evaluation_res = qianwen_long(file_id, scoring_prompt)
    result_data = process_data_based_on_key(parse_json_with_duplicates(evaluation_res))

    package_markers = ['一包', '二包', '三包', '四包', '五包']
    target_values = ['技术', '设计', '实施']

    # Handle each outer key that names a package on its own.
    per_package = {
        name: combine_technical_and_business(result_data[name], target_values)
        for name in result_data
        if any(marker in name for marker in package_markers)
    }
    if per_package:
        return per_package

    # No package-level keys: treat the whole structure as a single package.
    return combine_technical_and_business(result_data, target_values)


if __name__ == "__main__":
    # Manual run against a local sample file; the commented paths are alternative samples.
    # truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output2\\2-招标文件_evaluation_method.pdf"
    # truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output2\\2-招标文件（统计局智能终端二次招标）_evaluation_method.pdf"
    # truncate_file="C:\\Users\\Administrator\\Desktop\\货物标\\output2\\广水市妇幼招标文件最新（W改）_evaluation_method.pdf"
    truncate_file = "C:\\Users\\Administrator\\Desktop\\货物标\\output2\\622二次视频会议磋商文件_evaluation_method.pdf"
    # Pretty-print the result, keeping non-ASCII characters readable.
    print(json.dumps(combine_evaluation_standards(truncate_file), ensure_ascii=False, indent=4))