zbparse/flask_app/货物标/商务服务其他要求提取.py

# -*- encoding:utf-8 -*-
import json
import re
from PyPDF2 import PdfReader
import textwrap
from flask_app.general.doubao import read_txt_to_string, pdf2txt
from flask_app.general.json_utils import combine_json_results, clean_json_string
from flask_app.general.通义千问long import upload_file, qianwen_long_stream
from flask_app.货物标.截取pdf货物标版 import extract_common_header, clean_page_content
from flask_app.general.format_change import docx2pdf, pdf2docx
import concurrent.futures
from flask_app.general.doubao import doubao_model


# 正则表达式判断原文中是否有商务、服务、其他要求
def find_exists(truncate_file, required_keys):
    """Report which requirement categories are mentioned in a procurement PDF.

    The PDF text is extracted page by page, cleaned with
    ``clean_page_content`` and trimmed to the span that starts after the
    chapter heading matched by ``begin_pattern`` on the first page and stops
    before the chapter heading matched by ``end_pattern`` on the last page.
    Every regex in ``required_keys`` is then searched in that text.

    Alias keywords are folded into canonical categories:
    总体要求 / 建设要求 -> 技术要求; 培训要求 / 质保要求 / 售后要求 -> 服务要求;
    进度要求 / 工期要求 -> 商务要求. When 服务要求 is directly preceded by 技术
    (optionally separated by punctuation) the combined category
    技术、服务要求 is reported instead, and 技术要求 / 服务要求 are then removed
    from the result.

    Args:
        truncate_file: PDF passed to ``extract_common_header`` and ``PdfReader``.
        required_keys: Iterable of regex strings whose characters are joined
            by whitespace-tolerant separators (backslash-s-star).

    Returns:
        list[str]: Matched category names with the whitespace separators
        stripped, in first-match order. Empty when the PDF has no pages.
    """
    # if not truncate_file:
    #     return ["技术要求", "商务要求", "服务要求", "其他要求"]
    common_header = extract_common_header(truncate_file)
    pdf_document = PdfReader(truncate_file)
    page_count = len(pdf_document.pages)
    if page_count == 0:
        # Nothing to scan; indexing pages[0] below would raise IndexError.
        return []

    # Heading that opens the procurement-requirements chapter.
    begin_pattern = re.compile(
        r'(?:^第[一二三四五六七八九十百千]+(?:章|部分)\s*'  # "第X章" or "第X部分"
        r'[\u4e00-\u9fff、()（）]*?'  # allowed title characters
        r'(?:(?:服务|项目|商务|技术)[\u4e00-\u9fff、()（）]*?要求|'  # 服务/项目/商务/技术 ... 要求
        r'(?:采购|需求)[\u4e00-\u9fff、()（）]*?)'  # 采购 or 需求
        r'\s*$|'  # end of line
        r'^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*?'  # "第X章" line that does not contain 说明
        r'(?:采购内容|采购要求|需求).*|'  # 采购内容 / 采购要求 / 需求 keyword
        r'^[一二三四五六七八九十百千]+、\s*采购清单)'  # "一、采购清单"
        r'\s*$',  # end of line
        re.MULTILINE
    )
    # Heading of any chapter; used to cut off whatever follows the section.
    end_pattern = re.compile(
        r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$', re.MULTILINE)

    def cleaned_page(index):
        # extract_text() may yield None for pages without a text layer.
        raw_text = pdf_document.pages[index].extract_text() or ""
        return clean_page_content(raw_text, common_header)

    # Locate the start of the section on the first page.
    first_page_clean = cleaned_page(0)
    start_match = begin_pattern.search(first_page_clean)
    if start_match:
        first_content = first_page_clean[start_match.end():]
    else:
        print("未找到开始模式，返回完整第一页")
        first_content = first_page_clean

    if page_count == 1:
        # First and last page are the same page. Look for the closing heading
        # only AFTER the opening one; searching the whole page would hit the
        # opening heading itself and the page would be counted twice.
        if start_match:
            end_match = end_pattern.search(first_content)
            if end_match:
                first_content = first_content[:end_match.start()]
            else:
                print("未找到结束模式，返回完整最后一页")
        relevant_text = first_content
    else:
        # Locate the end of the section on the last page.
        last_page_clean = cleaned_page(-1)
        end_match = end_pattern.search(last_page_clean)
        if end_match:
            last_content = last_page_clean[:end_match.start()]
        else:
            print("未找到结束模式，返回完整最后一页")
            last_content = last_page_clean

        # Pages strictly between the first and the last are taken in full.
        middle_content = "".join(
            cleaned_page(page_num) + "\n" for page_num in range(1, page_count - 1)
        )
        relevant_text = first_content + "\n" + middle_content + "\n" + last_content

    # Collapse every whitespace run (including newlines) into one space so
    # that keywords broken across lines can still be matched.
    relevant_text = re.sub(r'\s+', ' ', relevant_text)

    tech_req = r"技\s*术\s*要\s*求"
    service_req = r"服\s*务\s*要\s*求"
    busi_req = r"商\s*务\s*要\s*求"
    combined_req = r"技\s*术\s*、\s*服\s*务\s*要\s*求"
    # Alias keyword -> canonical category reported in its place.
    canonical_of = {
        r"总\s*体\s*要\s*求": tech_req,
        r"建\s*设\s*要\s*求": tech_req,
        r"培\s*训\s*要\s*求": service_req,
        r"质\s*保\s*要\s*求": service_req,
        r"售\s*后\s*要\s*求": service_req,
        r"进\s*度\s*要\s*求": busi_req,
        r"工\s*期\s*要\s*求": busi_req,
    }
    punctuation = r"[，。？！、；：,.?!]*"

    matched_requirements = []
    for req in required_keys:
        if not re.search(req, relevant_text):
            continue
        if req in canonical_of:
            target = canonical_of[req]
        elif req == service_req and re.search(r"技\s*术" + punctuation + req, relevant_text):
            # "技术、服务要求" style heading: report the combined category.
            # The text holds no newlines any more, so one search over the
            # whole text is equivalent to the former per-line check.
            target = combined_req
        else:
            target = req
        if target not in matched_requirements:
            matched_requirements.append(target)

    # Strip the literal "\s*" separators to get the plain keywords back.
    clean_requirements = [req.replace(r'\s*', '') for req in matched_requirements]

    # Mutual exclusion: the combined category supersedes its two parts.
    if "技术、服务要求" in clean_requirements:
        clean_requirements = [req for req in clean_requirements if req not in ("技术要求", "服务要求")]
    # NOTE: keys outside the five canonical names are passed through as-is;
    # no whitelist filtering is applied here.
    return clean_requirements


def generate_queries(truncate_file, required_keys):
    """Build one extraction prompt per requirement category found in the file.

    Args:
        truncate_file: File handed to ``find_exists``.
        required_keys: Keyword regexes handed to ``find_exists``.

    Returns:
        list[str]: One prompt per key returned by ``find_exists``. Each prompt
        asks for that key and tells the model to skip all the other found keys.
    """
    found_keys = find_exists(truncate_file, required_keys)
    head_template = "这是一份货物标中采购要求部分的内容，请告诉我\"{}\"是什么，请以json格式返回结果，外层键名是\"{}\"，内层键值对中的键名是原文中的标题或者是你对相关子要求的总结，而键值需要完全与原文保持一致，不可擅自总结删减，注意你无需回答采购清单中具体设备的技术参数要求，仅需从正文部分开始提取，"
    tail = "若相关要求不存在，在键值中填'未知'。"

    prompts = []
    for current in found_keys:
        pieces = [head_template.format(current, current)]
        # Every other detected category is explicitly excluded from this query.
        excluded = [other for other in found_keys if other != current]
        if excluded:
            pieces.append("也不需要回答\"{}\"中的内容，".format("\"和\"".join(excluded)))
        pieces.append(tail)
        prompts.append("".join(pieces))
    return prompts


def generate_template(required_keys,full_text, type=1):
    """Assemble the extraction prompt for one group of requirement categories.

    Args:
        required_keys: Category names detected in the document.
        full_text: Document text appended after the prompt; skipped when falsy.
        type: 1 selects 服务要求 / 商务要求 / 其他要求, 2 selects
            技术要求 / 技术、服务要求. The other group is named in the prompt
            as content that must not be extracted.

    Returns:
        str: The full prompt, or "" when none of the selected group's keys
        remain after the mutual-exclusion rules are applied.
    """
    # Example payloads shown to the model: flat lists ...
    flat_examples = {
        "技术要求": ["相关技术要求1", "相关技术要求2"],
        "服务要求": ["服务要求1", "服务要求2", "服务要求3"],
        "商务要求": ["商务要求1", "商务要求2"],
        "其他要求": ["关于项目采购的其他要求1...", "关于项目采购的其他要求2...", "关于项目采购的其他要求3..."],
        "技术、服务要求": ["相关技术、服务要求内容1", "相关技术、服务要求内容2", "相关技术、服务要求内容3"]
    }
    # ... and one level of nesting.
    nested_examples = {
        "技术要求": {
            "子因素名1": ["相关技术要求1", "相关技术要求2"]
        },
        "服务要求": {
            "子因素名1": ["服务要求1"],
            "子因素名2": ["服务要求2", "服务要求3"]
        },
        "商务要求": {
            "子因素名1": ["商务要求1"],
            "子因素名2": ["商务要求2"]
        },
        "其他要求": {
            "子因素名1": ["关于项目采购的其他要求1...", "关于项目采购的其他要求2..."],
            "子因素名2": ["关于项目采购的其他要求3..."]
        },
        "技术、服务要求": {
            "子因素名1": ["相关技术、服务要求内容1"],
            "子因素名2": ["相关技术、服务要求内容2", "相关技术、服务要求内容3"]
        }
    }

    group_by_type = {
        1: ["服务要求", "商务要求", "其他要求"],
        2: ["技术要求", "技术、服务要求"]
    }
    own_group = group_by_type.get(type, [])
    # 3 - type maps 1 -> 2 and 2 -> 1, i.e. picks the opposite group.
    opposite_group = group_by_type.get(3 - type, [])
    another_keys_str = ', '.join(f"'{name}'" for name in opposite_group)

    # Mutual exclusion between the split and the combined categories.
    present = set(required_keys)
    if {"技术要求", "服务要求"} <= present:
        present.discard("技术、服务要求")
    elif "技术、服务要求" in present:
        present -= {"技术要求", "服务要求"}

    # Keep only this group's keys, in the group's predefined order.
    sorted_keys = [name for name in own_group if name in present]
    if not sorted_keys:
        return ""

    keys_str = '、'.join(sorted_keys)
    outer_keys_str = ', '.join(f"'{name}'" for name in sorted_keys)

    # Group-specific rules 4 and 5 of the prompt.
    if type == 1:
        raw_specific = """4. 若章节开头位置或者采购清单中除了需要采购的货物、数量、单位之外，还有带三角▲或五角星★的描述内容（如工期要求、进度要求、品牌要求等商务要求），也请将该部分内容提取出来，添加在外层键名为'商务要求'的键值部分；若存在标题包含'工期要求'、'进度要求'等和商务要求有关的关键字，也请将该标题下的内容提取，添加在外层键名为'商务要求'的键值部分。请不要遗漏这部分的'商务要求'。
    5. 在提取'服务要求'的时候，若原文（包含正文和表格）中存在'安装要求'、'售后要求'、'维护要求'、'培训要求'等服务相关的要求说明，请添加至'服务要求'的键值部分，不要遗漏这部分的'服务要求'。
                """
    else:
        raw_specific = """4. 在提取技术要求或技术、服务要求时，你无需从采购清单或表格中提取技术要求以及参数要求，你仅需定位到原文中包含'技术要求'或'技术、服务要求'关键字的标题并提取其后相关内容，从正文中提取；若技术要求或技术服务要求的内容全在表格中，键值为空列表[]。
    5. 在提取'技术要求'时，注意不要提取有关'安装、售后、维护、运维、培训、质保'等要求，它们不属于'技术要求'，但是若大标题中包含'总体要求''建设要求'等和技术要求相关的关键字，请添加到'技术要求'的键值部分。
                """
    specific_instructions = textwrap.dedent(raw_specific)

    # Shared part of the prompt with the group-specific rules spliced in.
    prompt_instruction = textwrap.dedent(
        f"""请你根据该货物类招标文件中的采购要求部分内容，请告诉我该项目采购的{keys_str}分别是什么，请以json格式返回结果，默认情况下外层键名是{outer_keys_str}，键值为字符串列表，每个字符串表示具体的一条要求，可以按原文中的序号作划分（若有序号的话），请按原文内容回答，保留三角▲、五角星★和序号（若有），不要擅自增删内容。
    
    注意事项：
    1. 若相应要求下存在子标题表示子要求因素但不具备实际的含义、要求，可以将它忽略而不是将它与下文具体要求进行多行合并，或者作为该要求下的嵌套键名，总之字符串列表中只提取具体的要求。
    2. 请不要提取{another_keys_str}中的内容。

    要求与指南：
    1. JSON 的结构要求：
        - 默认情况无需嵌套键值对，键值为字符串列表；若存在嵌套结构(即有明确标题表示各子要求)，则嵌套键名是各子要求，嵌套键值为字符串列表，为该子要求下的若干要求，最多一层嵌套。
        - 每个外层键对应的值可以是：
            a. 一个字符串列表，表示具体的一条条要求。若只有一条要求，也用字符串列表表示。
            b. 一个对象（字典），其键为子因素名，值为字符串列表。
            c. 若文档中无符合的相关要求，键值为空列表[]
        - 最多只允许一层嵌套。
    2. 请优先且准确定位正文部分包含以下关键字的标题：{outer_keys_str}，在其之后提取'XX要求'相关内容，尽量避免在无关地方提取内容。
    3. 注意请不要返回Markdown表格语法，必要时使用冒号':'将相关信息拼接在一起 
    {specific_instructions}
    6. 字符串列表中的每个字符串内容需与原文内容保持一致，保留前面的三角▲、五角星★和序号（如果有），但不可以擅自添加这些内容。
    """)

    # Restrict both example payloads to the selected keys and render as JSON.
    flat_json = json.dumps(
        {name: sample for name, sample in flat_examples.items() if name in sorted_keys},
        indent=4, ensure_ascii=False)
    nested_json = json.dumps(
        {name: sample for name, sample in nested_examples.items() if name in sorted_keys},
        indent=4, ensure_ascii=False)

    # Full query: instructions followed by the two format examples.
    query_parts = [
        "\n    ", prompt_instruction,
        "\n    以下为示例输出，仅供格式参考：",
        "\n    示例 1,无嵌套键值对：",
        "\n    ", flat_json,
        "\n    示例 2，嵌套键值对形式：",
        "\n    ", nested_json,
        "\n    ",
    ]
    if full_text:
        query_parts.append(f"\n\n文件内容：{full_text}")
    return "".join(query_parts)


def get_business_requirements(procurement_path, processed_filepath, model_type):
    """Extract business/service/other and technical requirements via an LLM.

    Steps: detect which requirement categories occur in the procurement file
    (``find_exists``), build one prompt for the business-side group and one
    for the technical group (``generate_template``), run both queries in
    parallel and merge the parsed JSON answers.

    Args:
        procurement_path: Path of the procurement section (.pdf, .doc or
            .docx). Word files are passed through ``docx2pdf`` before keyword
            detection.
        processed_filepath: Text file read with ``read_txt_to_string``; its
            content is appended to both prompts.
        model_type: Truthy -> each prompt is sent to ``doubao_model``.
            Falsy -> the file is uploaded once with ``upload_file`` (PDFs are
            first passed through ``pdf2docx``) and each prompt is sent to
            ``qianwen_long_stream`` together with the resulting file id.

    Returns:
        dict: Merged results of both queries; ``{}`` when no category was
        detected. A query that raises is reported with ``print`` and skipped.
    """
    # Raw strings: "\s" inside a normal literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+). The string values are unchanged.
    required_keys = [r"技\s*术\s*要\s*求", r"商\s*务\s*要\s*求", r"服\s*务\s*要\s*求", r"其\s*他\s*要\s*求",
                     r"总\s*体\s*要\s*求", r"建\s*设\s*要\s*求", r"进\s*度\s*要\s*求", r"工\s*期\s*要\s*求",
                     r"质\s*保\s*要\s*求", r"培\s*训\s*要\s*求", r"售\s*后\s*要\s*求"]

    # Keyword detection works on a PDF; convert doc/docx first.
    procurement_pdf_path = procurement_path
    if procurement_path.lower().endswith(('.doc', '.docx')):
        procurement_pdf_path = docx2pdf(procurement_path)

    # Which requirement categories does the document actually contain?
    contained_keys = find_exists(procurement_pdf_path, required_keys)
    print(contained_keys)
    if not contained_keys:
        return {}

    # Full text of the document, embedded into the prompts.
    full_text = read_txt_to_string(processed_filepath)

    # One prompt per group; an empty string means the group has nothing to ask.
    busi_user_query = generate_template(contained_keys, full_text, 1)
    tech_user_query = generate_template(contained_keys, full_text, 2)

    final_res = {}

    # File-based path: upload once and share the id between both queries.
    file_id = None
    if not model_type:
        procurement_docx_path = procurement_path
        if procurement_path.lower().endswith('.pdf'):
            procurement_docx_path = pdf2docx(procurement_path)
        file_id = upload_file(procurement_docx_path)

    # Run the business and technical queries in parallel.
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        futures = []
        for user_query in (busi_user_query, tech_user_query):
            if not user_query:
                continue
            if model_type:
                futures.append(executor.submit(doubao_model, user_query))
            else:
                # The trailing (2, 1) arguments are passed through unchanged;
                # their meaning is defined by qianwen_long_stream.
                futures.append(executor.submit(qianwen_long_stream, file_id, user_query, 2, 1))

        # Merge whatever comes back; a failed query must not drop the other one.
        for future in concurrent.futures.as_completed(futures):
            try:
                result = future.result()
                if result:  # skip empty answers
                    final_res.update(clean_json_string(result))
            except Exception as e:
                print(f"An error occurred: {e}")

    return final_res


# TODO:改为先判断，再摘取
if __name__ == "__main__":
    # Manual smoke test against local sample files.
    # truncate_file = "C:\\Users\\Administrator\\Desktop\\fsdownload\\e4be098d-b378-4126-9c32-a742b237b3b1\\ztbfile_procurement.docx"
    # truncate_file = r"C:\Users\Administrator\Desktop\货物标\output1\2-招标文件（广水市教育局封闭管理）_procurement.pdf"
    procurement_path = r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\invalid_added.docx'
    # truncate_file=r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0519-001-招标文件_procurement.pdf"
    # file_id = upload_file(truncate_file)
    # processed_filepath = pdf2txt(procurement_path)
    processed_filepath = r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\extract1.txt'
    # model_type is a required parameter; omitting it raised TypeError.
    # 1 selects the doubao_model path, a falsy value the file-upload path.
    final_res = get_business_requirements(procurement_path, processed_filepath, model_type=1)
    print(json.dumps(final_res, ensure_ascii=False, indent=4))
-.8采购要求更加完整

											
										
										
											2024-11-08 15:44:29 +08:00
+								# -*- encoding:utf-8 -*-
-.18

											
										
										
											2024-09-18 11:57:17 +08:00
+								import json
 								import re
 								from PyPDF2 import PdfReader
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								import textwrap
-.24 速率限制

											
										
										
											2024-11-25 09:15:56 +08:00
+								from flask_app.general.doubao import read_txt_to_string, pdf2txt
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								from flask_app.general.json_utils import combine_json_results, clean_json_string
 								from flask_app.general.通义千问long import upload_file, qianwen_long_stream
-.23 无效标优化

											
										
										
											2024-10-23 20:33:41 +08:00
+								from flask_app.货物标.截取pdf货物标版 import extract_common_header, clean_page_content
-.19 invalid_path转md格式前增加判断

											
										
										
											2024-12-19 15:12:42 +08:00
+								from flask_app.general.format_change import docx2pdf, pdf2docx
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								import concurrent.futures
-.23 修改商务要求bug

											
										
										
											2024-11-23 15:38:52 +08:00
+								from flask_app.general.doubao import doubao_model
-.18

											
										
										
											2024-09-18 11:57:17 +08:00
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								# 正则表达式判断原文中是否有商务、服务、其他要求
 								def find_exists(truncate_file, required_keys):
 								    # if not truncate_file:
 								    #     return ["技术要求", "商务要求", "服务要求", "其他要求"]
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								    common_header = extract_common_header(truncate_file)  # 假设该函数已定义
-.18

											
										
										
											2024-09-18 11:57:17 +08:00
+								    pdf_document = PdfReader(truncate_file)
-.8采购要求更加完整

											
										
										
											2024-11-08 15:44:29 +08:00
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								    # 定义正则模式
-.18

											
										
										
											2024-09-18 11:57:17 +08:00
+								    begin_pattern = re.compile(
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								        r'(?:^第[一二三四五六七八九十百千]+(?:章|部分)\s*'  # 匹配“第X章”或“第X部分”
 								        r'[\u4e00-\u9fff、()（）]*?'  # 匹配允许的字符
 								        r'(?:(?:服务|项目|商务|技术)[\u4e00-\u9fff、()（）]*?要求|'  # 匹配“服务”、“项目”、“商务”或“技术”后跟“要求”
 								        r'(?:采购|需求)[\u4e00-\u9fff、()（）]*?)'  # 匹配“采购”或“需求”
 								        r'\s*$|'  # 匹配行尾
 								        r'^第[一二三四五六七八九十百千]+(?:章|部分)(?!.*说明).*?'  # 匹配“第X章”后带“采购内容”等，排除“说明”
 								        r'(?:采购内容|采购要求|需求).*|'  # 匹配“采购内容”或“采购要求”关键词
 								        r'^[一二三四五六七八九十百千]+、\s*采购清单)'  # 匹配“一、采购清单”
 								        r'\s*$',  # 匹配行尾
 								        re.MULTILINE
 								    )
-.18

											
										
										
											2024-09-18 11:57:17 +08:00
+								    end_pattern = re.compile(
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								        r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff、()（）]+\s*$', re.MULTILINE)
-.18

											
										
										
											2024-09-18 11:57:17 +08:00
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								    # 只处理第一页和最后一页
 								    first_page = pdf_document.pages[0].extract_text() or ""
 								    last_page = pdf_document.pages[-1].extract_text() or ""
-.18

											
										
										
											2024-09-18 11:57:17 +08:00
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								    # 清理页面内容
 								    first_page_clean = clean_page_content(first_page, common_header)
 								    last_page_clean = clean_page_content(last_page, common_header)
-.18

											
										
										
											2024-09-18 11:57:17 +08:00
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								    # 在第一页寻找起始位置
 								    start_match = re.search(begin_pattern, first_page_clean)
 								    if not start_match:
 								        print("未找到开始模式，返回完整第一页")
 								        first_content = first_page_clean
-.18

											
										
										
											2024-09-18 11:57:17 +08:00
+								    else:
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								        start_index = start_match.end()
 								        first_content = first_page_clean[start_index:]
 								    # 在最后一页寻找结束位置
 								    end_match = re.search(end_pattern, last_page_clean)
 								    if not end_match:
 								        print("未找到结束模式，返回完整最后一页")
 								        last_content = last_page_clean
 								    else:
 								        last_content = last_page_clean[:end_match.start()]
 								    # 获取中间页面的内容
 								    middle_content = ""
 								    if len(pdf_document.pages) > 2:
 								        for page_num in range(1, len(pdf_document.pages) - 1):
 								            page_text = pdf_document.pages[page_num].extract_text() or ""
 								            cleaned_text = clean_page_content(page_text, common_header)
 								            middle_content += cleaned_text + "\n"
 								    # 组合所有内容
 								    relevant_text = first_content + "\n" + middle_content + "\n" + last_content
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								    relevant_text = re.sub(r'\s+', ' ', relevant_text)
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								    # print(f"提取的内容范围:\n{relevant_text}")
 								    # 匹配所需的要求
-.18

											
										
										
											2024-09-18 11:57:17 +08:00
+								    matched_requirements = []
 								    punctuation = r"[，。？！、；：,.?!]*"
 								    for req in required_keys:
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								        if re.search(req, relevant_text):
-.29 优化提示词

											
										
										
											2024-11-29 15:02:07 +08:00
+								            # 替换特定的关键词
 								            if req in [r"总\s*体\s*要\s*求", r"建\s*设\s*要\s*求"]:
 								                # 匹配到“总体要求”或“建设要求”时，添加“技术要求”而非原关键词
 								                tech_req = r"技\s*术\s*要\s*求"
 								                if tech_req not in matched_requirements:
 								                    matched_requirements.append(tech_req)
 								            elif req in [r"培\s*训\s*要\s*求",r"质\s*保\s*要\s*求",r"售\s*后\s*要\s*求"]:
 								                # 匹配到“培训要求”时，添加“服务要求”而非原关键词
 								                service_req = r"服\s*务\s*要\s*求"
 								                if service_req not in matched_requirements:
 								                    matched_requirements.append(service_req)
 								            elif req in [r"进\s*度\s*要\s*求",r"工\s*期\s*要\s*求"]:
 								                busi_req = r"商\s*务\s*要\s*求"
 								                if busi_req not in matched_requirements:
 								                    matched_requirements.append(busi_req)
 								            elif req == r"服\s*务\s*要\s*求":
 								                # 处理“服务要求”时的特殊逻辑
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								                lines = [line for line in relevant_text.split('\n') if re.search(req, line)]
-.29 优化提示词

											
										
										
											2024-11-29 15:02:07 +08:00
+								                pattern = r"技\s*术" + punctuation + req
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								                if any(re.search(pattern, line) for line in lines):
-.29 优化提示词

											
										
										
											2024-11-29 15:02:07 +08:00
+								                    combined_req = r"技\s*术\s*、\s*服\s*务\s*要\s*求"
 								                    if combined_req not in matched_requirements:
 								                        matched_requirements.append(combined_req)
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								                else:
-.29 优化提示词

											
										
										
											2024-11-29 15:02:07 +08:00
+								                    if req not in matched_requirements:
 								                        matched_requirements.append(req)
-.18

											
										
										
											2024-09-18 11:57:17 +08:00
+								            else:
-.29 优化提示词

											
										
										
											2024-11-29 15:02:07 +08:00
+								                # 对于其他匹配的关键词，直接添加
 								                if req not in matched_requirements:
 								                    matched_requirements.append(req)
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								    # 去除 \s*，仅返回原始关键词
 								    clean_requirements = [re.sub(r'\\s\*', '', req) for req in matched_requirements]
-.18

											
										
										
											2024-09-18 11:57:17 +08:00
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								    # 判断互斥关系：如果有"技术、服务要求"，删除"技术要求"和"服务要求"
 								    if "技术、服务要求" in clean_requirements:
 								        clean_requirements = [req for req in clean_requirements if req not in ["技术要求", "服务要求"]]
-.29 优化提示词

											
										
										
											2024-11-29 15:02:07 +08:00
+								        # 确保最终返回的列表仅包含指定的五项
 								    # allowed_requirements = {"技术要求", "服务要求", "商务要求", "其他要求", "技术、服务要求"}
 								    # final_requirements = [req for req in clean_requirements if req in allowed_requirements]
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								    return clean_requirements
-.18

											
										
										
											2024-09-18 11:57:17 +08:00
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
-.18

											
										
										
											2024-09-18 11:57:17 +08:00
+								def generate_queries(truncate_file, required_keys):
 								    key_list = find_exists(truncate_file, required_keys)
 								    queries = []
-.8采购要求更加完整

											
										
										
											2024-11-08 15:44:29 +08:00
+								    user_query_template = "这是一份货物标中采购要求部分的内容，请告诉我\"{}\"是什么，请以json格式返回结果，外层键名是\"{}\"，内层键值对中的键名是原文中的标题或者是你对相关子要求的总结，而键值需要完全与原文保持一致，不可擅自总结删减，注意你无需回答采购清单中具体设备的技术参数要求，仅需从正文部分开始提取，"
-.18

											
										
										
											2024-09-18 11:57:17 +08:00
+								    for key in key_list:
 								        query_base = user_query_template.format(key, key)
 								        other_keys = [k for k in key_list if k != key]
 								        if other_keys:
 								            query_base += "也不需要回答\"{}\"中的内容，".format("\"和\"".join(other_keys))
 								        query_base += "若相关要求不存在，在键值中填'未知'。"
 								        queries.append(query_base)
-.15货物标完整版

											
										
										
											2024-10-15 20:57:58 +08:00
+								        # print(query_base)
-.18

											
										
										
											2024-09-18 11:57:17 +08:00
+								    return queries
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
-.12 豆包测试

											
										
										
											2024-12-12 10:56:46 +08:00
+								def generate_template(required_keys,full_text, type=1):
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								    # 定义每个键对应的示例内容
 								    example_content1 = {
-.23 修改商务要求bug

											
										
										
											2024-11-23 15:38:52 +08:00
+								        "技术要求": ["相关技术要求1", "相关技术要求2"],
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								        "服务要求": ["服务要求1", "服务要求2", "服务要求3"],
 								        "商务要求": ["商务要求1", "商务要求2"],
-.29 优化提示词

											
										
										
											2024-11-29 15:02:07 +08:00
+								        "其他要求": ["关于项目采购的其他要求1...", "关于项目采购的其他要求2...", "关于项目采购的其他要求3..."],
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								        "技术、服务要求": ["相关技术、服务要求内容1", "相关技术、服务要求内容2", "相关技术、服务要求内容3"]
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								    }
 								    example_content2 = {
 								        "技术要求": {
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								            "子因素名1": ["相关技术要求1", "相关技术要求2"]
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								        },
 								        "服务要求": {
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								            "子因素名1": ["服务要求1"],
 								            "子因素名2": ["服务要求2", "服务要求3"]
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								        },
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								        "商务要求": {
 								            "子因素名1": ["商务要求1"],
 								            "子因素名2": ["商务要求2"]
 								        },
-.29 优化提示词

											
										
										
											2024-11-29 15:02:07 +08:00
+								        "其他要求": {
 								            "子因素名1": ["关于项目采购的其他要求1...", "关于项目采购的其他要求2..."],
 								            "子因素名2": ["关于项目采购的其他要求3..."]
 								        },
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								        "技术、服务要求": {
 								            "子因素名1": ["相关技术、服务要求内容1"],
 								            "子因素名2": ["相关技术、服务要求内容2", "相关技术、服务要求内容3"]
 								        }
 								    }
 								    # 将 required_keys 转换为集合以便于操作
 								    keys = set(required_keys)
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								    type_to_keys_map = {
 : ["服务要求", "商务要求", "其他要求"],
 : ["技术要求", "技术、服务要求"]
 								    }
 								    # 根据 type 获取对应的 all_possible_keys
 								    chosen_keys = type_to_keys_map.get(type, [])
 								    another_keys_list = type_to_keys_map.get(3 - type, [])  # 3 - type 将 type 1 映射到 2，反之亦然
 								    another_keys_str = ', '.join([f"'{key}'" for key in another_keys_list])
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								    # 处理互斥关系：如果 "技术要求" 和 "服务要求" 同时存在，则移除 "技术、服务要求"
 								    if "技术要求" in keys and "服务要求" in keys:
 								        keys.discard("技术、服务要求")
 								    # 如果 "技术、服务要求" 存在，则移除 "技术要求" 和 "服务要求"
 								    elif "技术、服务要求" in keys:
 								        keys.discard("技术要求")
 								        keys.discard("服务要求")
 								    # 确保 keys 中只包含允许的键
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								    keys = keys.intersection(chosen_keys)
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								    # 按照预定义的顺序排序键，以保持一致性
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								    sorted_keys = [key for key in chosen_keys if key in keys]
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								    # 如果没有任何键被选中，返回""
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								    if not sorted_keys:
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								        return ""
 								    # 生成模板的通用部分
 								    def generate_prompt_instruction(keys_str, outer_keys_str, another_keys_str, type):
 								        if type == 1:
 								            specific_instructions = textwrap.dedent(
-.29 优化提示词

											
										
										
											2024-11-29 15:02:07 +08:00
+								                """4. 若章节开头位置或者采购清单中除了需要采购的货物、数量、单位之外，还有带三角▲或五角星★的描述内容（如工期要求、进度要求、品牌要求等商务要求），也请将该部分内容提取出来，添加在外层键名为'商务要求'的键值部分；若存在标题包含'工期要求'、'进度要求'等和商务要求有关的关键字，也请将该标题下的内容提取，添加在外层键名为'商务要求'的键值部分。请不要遗漏这部分的'商务要求'。
 . 在提取'服务要求'的时候，若原文（包含正文和表格）中存在'安装要求'、'售后要求'、'维护要求'、'培训要求'等服务相关的要求说明，请添加至'服务要求'的键值部分，不要遗漏这部分的'服务要求'。
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								                """
 								            )
 								        else:
-.27 清理file_id

											
										
										
											2024-11-28 11:57:32 +08:00
+								            specific_instructions = textwrap.dedent(
-.29 优化提示词

											
										
										
											2024-11-29 15:02:07 +08:00
+								                """4. 在提取技术要求或技术、服务要求时，你无需从采购清单或表格中提取技术要求以及参数要求，你仅需定位到原文中包含'技术要求'或'技术、服务要求'关键字的标题并提取其后相关内容，从正文中提取；若技术要求或技术服务要求的内容全在表格中，键值为空列表[]。
 . 在提取'技术要求'时，注意不要提取有关'安装、售后、维护、运维、培训、质保'等要求，它们不属于'技术要求'，但是若大标题中包含'总体要求''建设要求'等和技术要求相关的关键字，请添加到'技术要求'的键值部分。
-.27 清理file_id

											
										
										
											2024-11-28 11:57:32 +08:00
+								                """
 								            )
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								        return textwrap.dedent(
-.2 清理bug修改

											
										
										
											2024-12-03 11:50:15 +08:00
+								            f"""请你根据该货物类招标文件中的采购要求部分内容，请告诉我该项目采购的{keys_str}分别是什么，请以json格式返回结果，默认情况下外层键名是{outer_keys_str}，键值为字符串列表，每个字符串表示具体的一条要求，可以按原文中的序号作划分（若有序号的话），请按原文内容回答，保留三角▲、五角星★和序号（若有），不要擅自增删内容。
 								    注意事项：
 . 若相应要求下存在子标题表示子要求因素但不具备实际的含义、要求，可以将它忽略而不是将它与下文具体要求进行多行合并，或者作为该要求下的嵌套键名，总之字符串列表中只提取具体的要求。
 . 请不要提取{another_keys_str}中的内容。
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
 								    要求与指南：
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+. JSON 的结构要求：
-.29 优化提示词

											
										
										
											2024-11-29 15:02:07 +08:00
+								        - 默认情况无需嵌套键值对，键值为字符串列表；若存在嵌套结构(即有明确标题表示各子要求)，则嵌套键名是各子要求，嵌套键值为字符串列表，为该子要求下的若干要求，最多一层嵌套。
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								        - 每个外层键对应的值可以是：
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								            a. 一个字符串列表，表示具体的一条条要求。若只有一条要求，也用字符串列表表示。
 								            b. 一个对象（字典），其键为子因素名，值为字符串列表。
-.29 优化提示词

											
										
										
											2024-11-29 15:02:07 +08:00
+								            c. 若文档中无符合的相关要求，键值为空列表[]
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								        - 最多只允许一层嵌套。
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+. 请优先且准确定位正文部分包含以下关键字的标题：{outer_keys_str}，在其之后提取'XX要求'相关内容，尽量避免在无关地方提取内容。
-.29 优化提示词

											
										
										
											2024-11-29 15:02:07 +08:00
+. 注意请不要返回Markdown表格语法，必要时使用冒号':'将相关信息拼接在一起
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								    {specific_instructions}
-.2 清理bug修改

											
										
										
											2024-12-03 11:50:15 +08:00
+. 字符串列表中的每个字符串内容需与原文内容保持一致，保留前面的三角▲、五角星★和序号（如果有），但不可以擅自添加这些内容。
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								    """)
 								    # 过滤示例内容
 								    def filter_example_content(example_content, keys):
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								        return {k: v for k, v in example_content.items() if k in keys}
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								    def format_example(example_content):
 								        return json.dumps(example_content, indent=4, ensure_ascii=False)
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								    filtered_example_content1 = filter_example_content(example_content1, sorted_keys)
 								    filtered_example_content2 = filter_example_content(example_content2, sorted_keys)
 								    tech_json_example1_str = format_example(filtered_example_content1)
 								    tech_json_example2_str = format_example(filtered_example_content2)
 								    keys_str = '、'.join(sorted_keys)
 								    outer_keys_str = ', '.join([f"'{key}'" for key in sorted_keys])
 								    prompt_instruction = generate_prompt_instruction(keys_str, outer_keys_str, another_keys_str, type)
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								    # 完整的用户查询模板，包含两份示例输出
 								    user_query_template = f"""
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								    {prompt_instruction}
 								    以下为示例输出，仅供格式参考：
-.29 优化提示词

											
										
										
											2024-11-29 15:02:07 +08:00
+								    示例 1,无嵌套键值对：
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								    {tech_json_example1_str}
-.29 优化提示词

											
										
										
											2024-11-29 15:02:07 +08:00
+								    示例 2，嵌套键值对形式：
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								    {tech_json_example2_str}
 								    """
-.12 豆包测试

											
										
										
											2024-12-12 10:56:46 +08:00
+								    if full_text:
 								        user_query_template += f"\n\n文件内容：{full_text}"
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								    return user_query_template
-.18

											
										
										
											2024-09-18 11:57:17 +08:00
-.19 invalid_path转md格式前增加判断

											
										
										
											2024-12-19 15:12:42 +08:00
 								def get_business_requirements(procurement_path, processed_filepath, model_type):
 								    required_keys = ["技\s*术\s*要\s*求", "商\s*务\s*要\s*求", "服\s*务\s*要\s*求", "其\s*他\s*要\s*求",
 								                     "总\s*体\s*要\s*求", "建\s*设\s*要\s*求", "进\s*度\s*要\s*求", "工\s*期\s*要\s*求",
 								                     "质\s*保\s*要\s*求", "培\s*训\s*要\s*求", "售\s*后\s*要\s*求"]
 								    # 将 doc/docx 转换为 pdf
 								    procurement_pdf_path = procurement_path
-.18 截取pdf

											
										
										
											2024-12-18 12:26:27 +08:00
+								    if procurement_path.lower().endswith(('.doc', '.docx')):
 								        procurement_pdf_path = docx2pdf(procurement_path)
-.19 invalid_path转md格式前增加判断

											
										
										
											2024-12-19 15:12:42 +08:00
 								    # 查找包含的关键词
-.17 无效投标、废标

											
										
										
											2024-12-17 15:42:20 +08:00
+								    contained_keys = find_exists(procurement_pdf_path, required_keys)
-.20 修改bug

											
										
										
											2024-11-20 15:44:05 +08:00
+								    print(contained_keys)
-.24 速率限制

											
										
										
											2024-11-25 09:15:56 +08:00
+								    if not contained_keys:
 								        return {}
-.19 invalid_path转md格式前增加判断

											
										
										
											2024-12-19 15:12:42 +08:00
 								    # 读取文件全文
-.12 豆包测试

											
										
										
											2024-12-12 10:56:46 +08:00
+								    full_text = read_txt_to_string(processed_filepath)
-.19 invalid_path转md格式前增加判断

											
										
										
											2024-12-19 15:12:42 +08:00
 								    # 生成业务查询和技术查询
-.12 豆包测试

											
										
										
											2024-12-12 10:56:46 +08:00
+								    busi_user_query = generate_template(contained_keys, full_text, 1)
 								    tech_user_query = generate_template(contained_keys, full_text, 2)
-.19 invalid_path转md格式前增加判断

											
										
										
											2024-12-19 15:12:42 +08:00
 								    # 初始化结果存储
 								    final_res = {}
 								    # 如果是非模型调用，需要提前上传文件并获取 file_id
 								    file_id = None
 								    if not model_type:
 								        procurement_docx_path=procurement_path
 								        if procurement_path.lower().endswith('.pdf'):
 								            procurement_docx_path = pdf2docx(procurement_path)
 								        file_id = upload_file(procurement_docx_path)  # 只上传一次文件，避免冗余调用
 								    # 并行处理业务和技术查询
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
 								        futures = []
 								        if busi_user_query:
-.19 invalid_path转md格式前增加判断

											
										
										
											2024-12-19 15:12:42 +08:00
+								            if model_type:
 								                # 如果是模型调用，直接使用 doubao_model
 								                futures.append(executor.submit(doubao_model, busi_user_query))
 								            else:
 								                # 使用 qianwen_long_stream 并传入 file_id
 								                futures.append(executor.submit(qianwen_long_stream, file_id, busi_user_query, 2, 1))
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								        if tech_user_query:
-.19 invalid_path转md格式前增加判断

											
										
										
											2024-12-19 15:12:42 +08:00
+								            if model_type:
 								                # 如果是模型调用，直接使用 doubao_model
 								                futures.append(executor.submit(doubao_model, tech_user_query))
 								            else:
 								                # 使用 qianwen_long_stream 并传入 file_id
 								                futures.append(executor.submit(qianwen_long_stream, file_id, tech_user_query, 2, 1))
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
+								        # 获取结果
 								        for future in concurrent.futures.as_completed(futures):
 								            try:
 								                result = future.result()
 								                if result:  # 确保结果不为空
 								                    final_res.update(clean_json_string(result))
 								            except Exception as e:
 								                print(f"An error occurred: {e}")
-.19 invalid_path转md格式前增加判断

											
										
										
											2024-12-19 15:12:42 +08:00
-.23测试分段

											
										
										
											2024-09-23 15:49:30 +08:00
+								    return final_res
-.18

											
										
										
											2024-09-18 11:57:17 +08:00
-.27 优化商务要求提取的提示词

											
										
										
											2024-11-27 16:58:25 +08:00
 								# TODO:改为先判断，再摘取
-.18

											
										
										
											2024-09-18 11:57:17 +08:00
if __name__ == "__main__":
    # Manual smoke test: run the extraction on a local sample and print the result.
    # truncate_file = "C:\\Users\\Administrator\\Desktop\\fsdownload\\e4be098d-b378-4126-9c32-a742b237b3b1\\ztbfile_procurement.docx"
    # truncate_file = r"C:\Users\Administrator\Desktop\货物标\output1\2-招标文件（广水市教育局封闭管理）_procurement.pdf"
    # truncate_file=r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0519-001-招标文件_procurement.pdf"
    # file_id = upload_file(truncate_file)
    procurement_path = r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\invalid_added.docx'
    # processed_filepath = pdf2txt(procurement_path)
    processed_filepath = r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\extract1.txt'
    # get_business_requirements requires model_type; the previous two-argument
    # call raised TypeError. A truthy value selects the doubao_model branch,
    # which works from the text already extracted into processed_filepath.
    final_res = get_business_requirements(procurement_path, processed_filepath, model_type=1)
    print(json.dumps(final_res, ensure_ascii=False, indent=4))