11.28 Clean up file_id
This commit is contained in:
parent 4b4937780e
commit 493e8abf31

172  flask_app/old_version/不得存在及禁止投标情形_old.py  Normal file
@@ -0,0 +1,172 @@
import json
import os
import re

from PyPDF2 import PdfWriter, PdfReader

from flask_app.general.通义千问long import upload_file, qianwen_long
from flask_app.general.通用功能函数 import process_string_list


def extract_and_format_from_paths(json_paths, includes, excludes):
    """
    Read data from multiple JSON file paths, extract the entries that contain
    specific keywords, and merge them into one formatted list.

    Args:
        json_paths (list): List of JSON file paths.
        includes (list): Keywords an entry must contain.
        excludes (list): Keywords that disqualify an entry.

    Returns:
        list: Formatted strings from all files that satisfy the conditions.
    """
    all_formatted_results = []

    # Iterate over each file path
    for path in json_paths:
        # Skip empty or non-existent paths
        if not path or not os.path.isfile(path):
            print(f"禁止投标情形: Error: The file path '{path}' is invalid or does not exist.")
            continue
        try:
            with open(path, 'r', encoding='utf-8') as file:
                # Load the JSON data
                json_data = json.load(file)
            formatted_results = []

            # Walk every key-value pair of the JSON data
            for key, value in json_data.items():
                if isinstance(value, dict):
                    # If the value is a dict, check each nested key-value pair
                    for sub_key, sub_value in value.items():
                        if any(include in sub_key for include in includes):
                            # The sub-key contains a keyword: collect the sub-value
                            formatted_results.append(f"{sub_value}")
                elif isinstance(value, str):  # clause
                    # Check whether the value contains any include keyword
                    for include in includes:
                        if include in value:
                            # Take the content before the include keyword
                            prefix = value.split(include)[0]
                            # Keep the value only when the prefix contains no exclude keyword
                            if not any(exclude in prefix for exclude in excludes):
                                # Drop the first line when the value spans several lines
                                if '\n' in value:
                                    value = value.split('\n', 1)[-1]
                                formatted_results.append(value)
                                break  # One matching keyword is enough; stop scanning

            # Append this file's results to the overall list
            all_formatted_results.extend(formatted_results)
        except FileNotFoundError:
            print(f"禁止投标情形: Error: The file '{path}' does not exist.")
        except json.JSONDecodeError:
            print(f"禁止投标情形: Error: The file '{path}' contains invalid JSON.")

    return all_formatted_results


def extract_unique_items_from_texts(texts):
    # Regex that matches a broad range of item-numbering styles, including Chinese enumeration marks
    pattern = re.compile(r'(?:\d+\.|\(\d+\)|\(\d+\)|\d+\)|①|②|③|④|⑤|⑥|⑦|⑧|⑨|⑩|⑪|⑫)\s*')
    intro_pattern = re.compile(r'^.*?[::]')
    punctuation_pattern = re.compile(r'[;。,、..,:;!?!?]+$')
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    content_check_pattern = re.compile(r'[\u4e00-\u9fa5a-zA-Z0-9]{2,}')  # Require at least 2 consecutive letters, digits, or Chinese characters

    all_results = []
    seen = set()

    for text in texts:
        # Strip tabs and newlines
        text = text.replace('\t', '').replace('\n', '')

        # Remove the leading boilerplate up to the colon (keep what follows it)
        text = intro_pattern.sub('', text)

        # Replace URLs with placeholders and remember them so they can be restored later
        urls = []

        def url_replacer(match):
            urls.append(match.group(0))
            return f"{{URL{len(urls)}}}"

        text = url_pattern.sub(url_replacer, text)

        # Split the text on the numbering pattern
        items = pattern.split(text)

        for item in items:
            cleaned_item = item.strip()
            if cleaned_item:
                # Clean each item further
                cleaned_item = pattern.sub('', cleaned_item)
                cleaned_item = punctuation_pattern.sub('', cleaned_item)
                cleaned_item = cleaned_item.strip()

                # Restore the URLs
                for i, url in enumerate(urls, 1):
                    cleaned_item = cleaned_item.replace(f"{{URL{i}}}", url)

                # Keep only unseen items that carry real content and are longer than 3 characters
                if cleaned_item and cleaned_item not in seen and len(cleaned_item) > 3 and content_check_pattern.search(cleaned_item):
                    seen.add(cleaned_item)
                    all_results.append(cleaned_item)

    return all_results


def merge_pdfs(paths, output_filename):
    pdf_writer = PdfWriter()
    output_path = None

    for path in paths:
        pdf_reader = PdfReader(path)
        for page in range(len(pdf_reader.pages)):
            # Add each page to the writer object
            pdf_writer.add_page(pdf_reader.pages[page])

        if output_path is None:
            # Derive the output file path from the first input
            output_path = os.path.join(os.path.dirname(path), output_filename)

    # Write the merged PDF to disk
    if output_path:
        with open(output_path, 'wb') as out:
            pdf_writer.write(out)
        print(f"禁止投标情形: Merged PDF saved to {output_path}")
    else:
        print("禁止投标情形: No files to merge.")
    return output_path


def find_forbidden(qualification=""):
    try:
        if qualification:
            file_id = upload_file(qualification)
            user_query_forbidden = (
                "该招标文件规定的投标人不得存在的其他情形有哪些,请以列表给我提供信息,形如[xx,xx,...],"
                "请你不要回答有关\"信誉要求\"的内容,若原文未提及,返回[]。"
            )
            qianwen_forbidden_str = qianwen_long(file_id, user_query_forbidden)
        else:
            qianwen_forbidden_str = "[]"

        actual_list = process_string_list(qianwen_forbidden_str)  # Extract the string list, e.g. ["xxx","xx"]
        includes = ["不得存在", "不得与", "禁止投标", "对投标人的纪律"]
        excludes = ["招标", "评标", "定标"]
        # forbidden_results = extract_and_format_from_paths([truncate_json_path, clause_path], includes, excludes)
        # processed_results = extract_unique_items_from_texts(forbidden_results)
        # merged_forbidden_list = list(dict.fromkeys(actual_list + processed_results))
        merged_forbidden_list = list(dict.fromkeys(actual_list))
        forbidden_dict = {'不得存在的其他情形': merged_forbidden_list}
        return forbidden_dict
    except Exception as e:
        print(f"find_forbidden 在处理时发生异常: {e}")
        return {'不得存在的其他情形': ""}


# TODO: The "forbidden situations" appear throughout the document; the goods-bid pipeline switched to a full-text-search approach.
if __name__ == '__main__':
    # truncate_json_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\truncate_output.json"
    # clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json"
    qualification = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_qualification.pdf"
    output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f"
    doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile.docx'
    res = find_forbidden(qualification)
    print(json.dumps(res, ensure_ascii=False, indent=4))
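For reference, a minimal usage sketch of extract_and_format_from_paths, assuming the function above is in scope; the sample JSON, keywords, and temp-file handling are hypothetical, chosen only to mirror the include/exclude logic:

import json
import os
import tempfile

sample = {
    "第一章": {"禁止投标情形说明": "投标人不得与招标人存在利害关系。"},
    "3.1": "投标人不得存在下列情形:\n(1)被列入失信名单;(2)处于停业状态。",
}
with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False, encoding='utf-8') as f:
    json.dump(sample, f, ensure_ascii=False)
    tmp_path = f.name

# The nested entry matches because its sub-key contains "禁止投标"; the string
# entry matches "不得存在", its prefix "投标人" contains no exclude keyword,
# and its first line is dropped because the value spans several lines.
print(extract_and_format_from_paths([tmp_path], ["不得存在", "禁止投标"], ["招标", "评标"]))
os.remove(tmp_path)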
266  flask_app/old_version/判断截取位置_old.py  Normal file
@@ -0,0 +1,266 @@
# -*- encoding:utf-8 -*-
import json
import re
import os
import sys
from pathlib import Path
from openai import OpenAI
import concurrent.futures
import queue
import time


def qianwen_long(file_id, user_query):
    """
    Uses a previously uploaded file to generate a response based on a user query.
    """
    print("call qianwen-long...")
    client = OpenAI(
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
    )

    # Generate a response based on the file ID
    completion = client.chat.completions.create(
        model="qwen-long",
        top_p=0.5,
        temperature=0.4,
        messages=[
            {
                'role': 'system',
                'content': f'fileid://{file_id}'
            },
            {
                'role': 'user',
                'content': user_query
            }
        ],
        stream=False
    )

    # Return the response content
    return completion.choices[0].message.content


def llm_call(question, knowledge_name, file_id, result_queue, ans_index, llm_type):
    if llm_type == 2:
        print(f"qianwen_long! question:{question}")
        qianwen_res = qianwen_long(file_id, question)
        result_queue.put((ans_index, (question, qianwen_res)))
        return


def multi_threading(queries, knowledge_name="", file_id="", llm_type=1):
    if not queries:
        return []

    print("多线程提问:starting multi_threading...")
    result_queue = queue.Queue()

    # Manage the worker threads with a ThreadPoolExecutor
    with concurrent.futures.ThreadPoolExecutor(max_workers=15) as executor:
        # Submit the tasks one by one, sleeping 1 second after each submission
        future_to_query = {}
        for index, query in enumerate(queries):
            future = executor.submit(llm_call, query, knowledge_name, file_id, result_queue, index, llm_type)
            future_to_query[future] = index
            time.sleep(1)  # Wait 1 second after each submitted task

        # Collect the result of every thread
        for future in concurrent.futures.as_completed(future_to_query):
            index = future_to_query[future]
            try:
                future.result()  # Surface exceptions and confirm the task finished
            except Exception as exc:
                print(f"Query {index} generated an exception: {exc}")
                # Make sure a placeholder still reaches result_queue on failure
                result_queue.put((index, None))

    # Drain the queue and order the results by index
    results = [None] * len(queries)
    while not result_queue.empty():
        index, result = result_queue.get()
        results[index] = result
    # If every result is None, report no results at all
    if all(result is None for result in results):
        return []
    # Filter out the None placeholders
    results = [r for r in results if r is not None]
    # Always return a list
    return results


def upload_file(file_path):
    """
    Uploads a file to DashScope and returns the file ID.
    """
    client = OpenAI(
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
    )
    file = client.files.create(file=Path(file_path), purpose="file-extract")
    return file.id


def load_data(json_path):
    """
    Load data from the given JSON file.

    Args:
        json_path (str): Path to the JSON file.

    Returns:
        dict: The loaded JSON data.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return data
    except FileNotFoundError:
        print(f"错误:文件未找到 - {json_path}")
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"错误:解析JSON文件时出错 - {e}")
        sys.exit(1)


def define_target_names():
    """
    Define the list of target material names.

    Returns:
        list: Target names.
    """
    return [
        "营业执照",
        # "开户信息",
        "法定代表人身份证",
        # "法定代表人授权人身份证",
        "人员证书",
        "人员社保资料",
        # "劳动合同",
        "企业证书",
        "企业业绩",
        "财务审计报告",
        "缴纳税收证明",
        "公司缴纳社保证明"
    ]


def generate_user_query(target, chapters, keywords):
    """
    Build the user-query template for a target from its chapters and keywords.

    Args:
        target (str): Target name.
        chapters (list): Related chapters.
        keywords (list): Related keywords.

    Returns:
        str: The generated user query.
    """
    template3 = f"""这是投标文件模板,作为投标人,我需要把不同的投标材料填充到对应位置,请你根据该文件回答:{target}应该插入在该文件哪个地方?你可能需要查找以下关键词出现的地方:{', '.join([f"'{kw}'" for kw in keywords])},并确认插入的位置。我已在原文中打上若干待插入位置的标记,形如'[$$第5个待插入位置$$]',它的标记与它上面的小节内容关联。你需要返回给我{target}应该插入位置的标记序号,即'[$$第5个待插入位置$$]'中的'5',而不是页码,若有多个位置需要插入,可以返回多个序号,你的回答以数组返回,如[17, 19],若插入位置不明确,那么返回[-1]。
    """
    template2 = f"""你是一个专业撰写投标文件的专家,这是投标文件模板,作为投标人,你需要把不同的投标材料填充到对应位置,请你根据该文件回答:{target}应该插入在该文件哪个地方?你可能需要查找以下关键词出现的地方:{', '.join([f"'{kw}'" for kw in keywords])},或者根据文中'备注'或'附'内的信息确认插入的位置。目前原文中各章节末尾已打上待插入位置的标记,标记格式为'[$$当前章节名:第x个待插入位置$$]'形如'[$$四、投标保证金:第4个待插入位置$$]',每个标记与它紧跟着的上面的这一章内容关联。你需要返回给我{target}应该插入位置的标记序号,即'[$$四、投标保证金:第4个待插入位置$$]'中的'4',而不是当前页码,若有多个位置需要插入,可以返回多个序号,你的回答以数组返回,如[4, 5],若插入位置不明确,那么返回[-1]。
    """
    template1 = f"""这是投标文件模板,作为投标人,我需要把不同的投标材料填充到对应位置,请你根据该文件回答:{target}应该插入在该文件哪个地方?你可能需要查找以下关键词出现的地方:{', '.join([f"'{kw}'" for kw in keywords])},并确认插入的位置。我已在原文中各章节末尾打上待插入位置的标记,标记格式为'[$$当前章节名:第x个待插入位置$$]'形如'[$$四、投标保证金:第4个待插入位置$$]',每个标记与它紧跟着的上面的这一章内容关联。你需要返回给我{target}应该插入位置的标记数字序号,即'[$$四、投标保证金:第4个待插入位置$$]'中的'4',而不是当前页码,若有多个位置需要插入,可以返回多个序号,你的回答以数组返回,如[4, 5],若插入位置不明确,那么返回[-1]。
    """

    # Only template1 is used; template2 and template3 are kept as earlier drafts.
    return template1


def generate_user_queries(target_names, data_dict):
    """
    Build one user query per target.

    Args:
        target_names (list): Target names.
        data_dict (dict): The configuration dict.

    Returns:
        list: Dicts that pair each target with its query.
    """
    user_queries = []
    for target in target_names:
        if target in data_dict:
            chapters = data_dict[target].get("章节", [])
            keywords = data_dict[target].get("关键字", [])
            query = generate_user_query(target, chapters, keywords)
            user_queries.append({
                "target": target,
                "query": query
            })
        else:
            print(f"警告:'{target}'未在数据字典中找到相关信息。")
    return user_queries


def process_string_list(string_list):
    """
    Extract the content inside the square brackets of a string and convert it
    into an actual Python list.

    Args:
        string_list (str): A string containing square brackets.

    Returns:
        list: The parsed list content.
    """
    match = re.search(r'\[(.*?)\]', string_list)
    if match:
        content_inside = match.group(1).strip()
        if content_inside:
            items = [item.strip() for item in content_inside.split(',')]
            if all(item.isdigit() for item in items):
                formatted_list = [int(item) for item in items]
            else:
                formatted_list = items
            return formatted_list
    return []


def main():
    # Path to the JSON configuration
    # json_path = "flask_app/general/static/插入位置.json"
    json_path = "/flask_app/general/static/插入位置.json"
    # Load the data
    data_dict = load_data(json_path)

    # Define the target names
    target_names = define_target_names()

    # Build the user-query list
    user_query_list = generate_user_queries(target_names, data_dict)

    if not user_query_list:
        print("没有生成任何用户查询。")
        sys.exit(0)

    # Extract the queries
    queries = [item['query'] for item in user_query_list]

    # Define the document path
    format_part = "C:\\Users\\Administrator\\Desktop\\bid_format.pdf"
    # format_part="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹\\bid_format.docx"
    # Make sure the file exists
    if not os.path.isfile(format_part):
        print(f"错误:文件未找到 - {format_part}")
        sys.exit(1)

    # Upload the file and obtain a file_id
    file_id = upload_file(format_part)

    if not file_id:
        print("错误:文件上传失败。")
        sys.exit(1)

    # Run the queries in parallel
    results = multi_threading(queries, "", file_id, 2)

    if not results:
        print("错误:未收到任何处理结果。")
        sys.exit(1)

    # Clean up the returned results
    baseinfo_list = [process_string_list(res) for _, res in results]

    # Print the results
    for i, info in enumerate(baseinfo_list):
        print(f'{target_names[i]}:{info}')


if __name__ == "__main__":
    main()
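A quick sketch of what process_string_list above returns for typical model replies (the sample strings are hypothetical, and the function from the file above is assumed to be in scope):

print(process_string_list("应插入的位置为[4, 5]。"))  # -> [4, 5]  (all-digit items become ints)
print(process_string_list("[营业执照, 人员证书]"))      # -> ['营业执照', '人员证书']
print(process_string_list("未找到合适位置"))            # -> []  (no brackets in the reply)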
216  flask_app/old_version/回答来源_old.py  Normal file
@@ -0,0 +1,216 @@
# Multi-threaded questioning with answer sources; now deprecated
# assistant_id
import queue
import concurrent.futures
from dashscope import Assistants, Messages, Runs, Threads
from llama_index.indices.managed.dashscope import DashScopeCloudRetriever
from flask_app.general.json_utils import extract_content_from_json

prompt = """
# 角色
你是一个文档处理专家,专门负责理解和操作基于特定内容的文档任务,这包括解析、总结、搜索或生成与给定文档相关的各类信息。

## 技能
### 技能 1:文档解析与摘要
- 深入理解并分析${document1}的内容,提取关键信息。
- 根据需求生成简洁明了的摘要,保持原文核心意义不变。

### 技能 2:信息检索与关联
- 在${document1}中高效检索特定信息或关键词。
- 能够识别并链接到文档内部或外部的相关内容,增强信息的连贯性和深度。

## 限制
- 所有操作均需基于${document1}的内容,不可超出此范围创造信息。
- 在处理敏感或机密信息时,需遵守严格的隐私和安全规定。
- 确保所有生成或改编的内容逻辑连贯,无误导性信息。

请注意,上述技能执行时将直接利用并参考${document1}的具体内容,以确保所有产出紧密相关且高质量。
"""
prom = '请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}'


# Content between the 【正文】 tag and the 【文档名】 tag
def extract_content_between_tags(text):
    results = []

    # Split the text on "【正文】"
    parts = text.split('【正文】')[1:]  # Skip the first piece; nothing precedes the first tag

    for index, part in enumerate(parts):
        # Locate the "【文档名】" tag
        doc_name_index = part.find('【文档名】')
        # Locate the 'file_ids' tag
        file_ids_index = part.find("'file_ids'")

        # Decide where the extracted content ends, depending on whether "【文档名】" was found
        if doc_name_index != -1:
            end_index = doc_name_index
        elif file_ids_index != -1:
            end_index = file_ids_index
        else:
            end_index = len(part)

        # Extract the content
        content = part[:end_index].strip()
        results.append(content)

    # If file_ids appears in the last part, re-trim it so it only reaches file_ids
    if parts and "'file_ids'" in parts[-1]:  # Guard against an empty parts list
        file_ids_index = parts[-1].find("'file_ids'")
        if file_ids_index != -1:
            last_content = parts[-1][:file_ids_index].strip()
            results[-1] = last_content  # Update the last entry so it stops at file_ids

    return results


def find_references_in_extracted(formatted_ans, extracted_references):
    results = {}  # Dict that stores the match results

    # Recursive helper that walks arbitrarily nested dicts
    def recurse_through_dict(current_dict, path=[]):
        for key, value in current_dict.items():
            # If the value is itself a dict, recurse into it
            if isinstance(value, dict):
                recurse_through_dict(value, path + [key])
            else:
                # Special case: a value of '未知' maps straight to index -1
                if value == '未知':
                    results['.'.join(path + [key])] = -1
                else:
                    # Otherwise look for the value in the extracted references
                    found = False
                    for index, reference in enumerate(extracted_references):
                        if str(value) in reference:  # Stringify for compatibility
                            results['.'.join(path + [key])] = index  # Record the path in dot notation
                            found = True
                            break
                    if not found:
                        results['.'.join(path + [key])] = None

    # Start the recursion at the root dict
    recurse_through_dict(formatted_ans)
    return results


def send_message(assistant, message='百炼是什么?'):
    ans = []
    print(f"Query: {message}")

    # create thread.
    thread = Threads.create()
    print(thread)

    # create a message.
    message = Messages.create(thread.id, content=message)
    # create run
    run = Runs.create(thread.id, assistant_id=assistant.id)
    print(run)

    # wait for run completed or requires_action
    run_status = Runs.wait(run.id, thread_id=thread.id)
    print(run_status)
    reference_txt = str(run_status)
    extracted_references = extract_content_between_tags(reference_txt)  # List of cited source passages
    # get the thread messages.
    msgs = Messages.list(thread.id)
    for message in msgs['data'][::-1]:
        ans.append(message['content'][0]['text']['value'])
    return ans, extracted_references


def rag_assistant(knowledge_name):
    retriever = DashScopeCloudRetriever(knowledge_name)
    pipeline_id = str(retriever.pipeline_id)
    assistant = Assistants.create(
        model='qwen-max',
        name='smart helper',
        description='智能助手,支持知识库查询和插件调用。',
        temperature='0.3',
        instructions=prom,
        tools=[
            {
                "type": "code_interpreter"
            },
            {
                "type": "rag",
                "prompt_ra": {
                    "pipeline_id": pipeline_id,
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "query_word": {
                                "type": "str",
                                "value": "${document1}"
                            }
                        }
                    }
                }
            }]
    )
    return assistant


def pure_assistant():
    assistant = Assistants.create(
        model='qwen-max',
        name='smart helper',
        description='智能助手,能基于用户的要求精准简洁地回答用户的提问',
        instructions='智能助手,能基于用户的要求精准简洁地回答用户的提问',
        tools=[
            {
                "type": "code_interpreter"
            },
        ]
    )
    return assistant


def llm_call(question, knowledge_name, result_queue, ans_index, use_rag=True):
    if use_rag:
        assistant = rag_assistant(knowledge_name)
    else:
        assistant = pure_assistant()
    ans, extracted_references = send_message(assistant, message=question)
    for index, reference in enumerate(extracted_references, start=0):
        print(f"{index}. {reference}")
    formatted_ans = extract_content_from_json(ans[1])
    print(formatted_ans)

    results = find_references_in_extracted(formatted_ans, extracted_references)
    for key, index in results.items():
        print(f"{key}: Found at index {index}")
    result_queue.put((ans_index, (question, ans)))  # Queue the indexed (question, ans) pair


def multi_threading(queries, knowledge_name, use_rag=True):
    result_queue = queue.Queue()

    # Manage the worker threads with a ThreadPoolExecutor
    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
        # Keep each submitted task's Future in a dict so results can be ordered
        future_to_query = {executor.submit(llm_call, query, knowledge_name, result_queue, index, use_rag): index for
                           index, query in enumerate(queries)}

        # Collect the result of every thread
        for future in concurrent.futures.as_completed(future_to_query):
            index = future_to_query[future]
            # llm_call handles its own results; we only confirm the task finished
            try:
                future.result()  # Surface exceptions and confirm completion
            except Exception as exc:
                print(f"Query {index} generated an exception: {exc}")

    # Drain the queue and order the results by index
    results = [None] * len(queries)
    while not result_queue.empty():
        index, result = result_queue.get()
        results[index] = result
    return results


if __name__ == "__main__":
    # The list of questions
    questions = ["该招标文件的工程概况(或项目概况)是?招标范围是?请按json格式给我提供信息,键名分别为'工程概况','招标范围',若存在嵌套信息,嵌套内容键名以文件中对应字段命名,若存在未知信息,在对应的键值中填'未知'。"]
    knowledge_name = "招标解析5word"
    results = multi_threading(questions, knowledge_name, use_rag=True)
    # Print the results
    for question, response in results:
        print(f"Question: {question}")
        print(f"Response: {response}")
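A small sketch of find_references_in_extracted above on toy data (the answer dict and reference list are hypothetical):

ans = {"工程概况": {"项目名称": "示例项目", "工期": "未知"}, "招标范围": "土建施工"}
refs = ["本工程项目名称为示例项目,位于……", "招标范围:土建施工及安装。"]
print(find_references_in_extracted(ans, refs))
# -> {'工程概况.项目名称': 0, '工程概况.工期': -1, '招标范围': 1}
# Nested keys are flattened to dot paths, '未知' values map to -1, and each
# other value is matched by substring against the reference list.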
166  flask_app/old_version/基础信息整合_old.py  Normal file
@@ -0,0 +1,166 @@
import json

from flask_app.general.json_utils import clean_json_string, rename_outer_key
from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice
from flask_app.main.判断是否分包等 import judge_whether_main, read_questions_from_judge
from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
from flask_app.general.通义千问long import upload_file
from flask_app.general.通用功能函数 import judge_consortium_bidding


def aggregate_basic_info_engineering(baseinfo_list):
    """
    Merge the entries of the basic-information list and sort them into groups.

    Args:
        baseinfo_list (list): List of basic-information dicts.

    Returns:
        dict: The merged and grouped basic information.
    """
    key_groups = {
        "招标人/代理信息": ["招标人", "招标人联系方式", "招标代理机构", "招标代理机构联系方式"],
        "项目信息": ["项目名称", "招标编号", "项目概况", "招标范围", "招标控制价", "投标竞争下浮率"],
        "关键时间/内容": [
            "投标文件递交截止日期",
            "投标文件递交方式",
            "开标时间",
            "开标地点",
            "投标人要求澄清招标文件的截止时间",
            "投标有效期",
            "评标结果公示媒介"
        ],
        "保证金相关": ["质量保证金", "退还投标保证金"],
        "其他信息": [
            "重新招标、不再招标和终止招标",
            "投标费用承担",
            "招标代理服务费",
            "是否退还投标文件",
        ]
    }

    combined_data = {}
    relevant_keys_detected = set()

    # Merge all basic information and record which keys were seen
    for baseinfo in baseinfo_list:
        combined_data.update(baseinfo)
        relevant_keys_detected.update(baseinfo.keys())

    # Adjust the key groups dynamically
    dynamic_key_handling(key_groups, relevant_keys_detected)

    # Keep a copy of the entries that were not classified
    unclassified_items = {k: v for k, v in combined_data.items() if k not in [item for sublist in key_groups.values() for item in sublist]}

    # Classify by key group and nest the entries
    for group_name, keys in key_groups.items():
        group_data = {key: combined_data.get(key, "未提供") for key in keys}
        combined_data[group_name] = group_data
        # Remove the classified keys from unclassified_items
        for key in keys:
            unclassified_items.pop(key, None)

    # Move any remaining unclassified pairs into the "其他信息" group
    combined_data["其他信息"].update(unclassified_items)

    # Drop the unclassified pairs from the top level
    for key in list(combined_data.keys()):
        if key not in key_groups:
            del combined_data[key]

    return combined_data


def dynamic_key_handling(key_groups, detected_keys):
    # Inspect the detected keys and adjust the group configuration
    for key in detected_keys:
        # Deposit-related keys go into "保证金相关", inserted before "质量保证金"
        if "保证金" in key:
            group = key_groups["保证金相关"]
            insert_before = "质量保证金"
            if insert_before in group:
                index = group.index(insert_before)
                if key not in group:  # Avoid duplicate insertions
                    group.insert(index, key)
            else:
                group.append(key)  # Fall back to appending when the anchor key is missing
        elif "联合体" in key:
            key_groups["项目信息"].append(key)
        elif "分包" in key:
            key_groups["项目信息"].append(key)
        elif "踏勘现场" in key:
            key_groups["其他信息"].append(key)
        elif "投标预备会" in key:
            key_groups["其他信息"].append(key)
        elif "偏离" in key:
            key_groups["其他信息"].append(key)


def combine_basic_info(knowledge_name, truncate0, output_folder, clause_path):
    """
    Combine and post-process the basic information into the final dict.

    Args:
        knowledge_name (str): Knowledge-base name.
        truncate0 (str): File path (bidder-notice front table).
        output_folder (str): Output folder path.
        clause_path (str): Clause file path.

    Returns:
        dict: The combined basic information.
    """
    baseinfo_list = []
    baseinfo_file_path = 'flask_app/static/提示词/基本信息工程标.txt'
    # baseinfo_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\基本信息工程标.txt'
    questions = read_questions_from_file(baseinfo_file_path)
    res1 = multi_threading(questions, knowledge_name)

    for index, response in res1:
        try:
            if response and len(response) > 1:
                baseinfo_list.append(clean_json_string(response[1]))
            else:
                print(f"基础信息整合: Warning: Missing or incomplete response data for query index {index}.")
        except Exception as e:
            print(f"基础信息整合: Error processing response for query index {index}: {e}")

    # Decide questions such as subcontracting and whether a bid deposit must be submitted
    chosen_numbers, merged = judge_whether_main(truncate0, output_folder)
    baseinfo_list.append(merged)

    judge_file_path = 'flask_app/static/提示词/是否相关问题.txt'
    # judge_file_path = 'D:\\flask_project\\flask_app\\static\\提示词\\是否相关问题.txt'
    judge_questions = read_questions_from_judge(judge_file_path, chosen_numbers)
    judge_consortium = judge_consortium_bidding(baseinfo_list)  # Use the tender notice to decide whether consortium bids are accepted

    if judge_consortium:
        judge_consortium_question = (
            "该招标文件对于联合体投标的要求是怎样的,请按json格式给我提供信息,"
            "外层键名为'联合体投标要求',其中有一个嵌套键值对为:\"是否接受联合体投标\":\"是\""
        )
        judge_questions.append(judge_consortium_question)

    file_id = upload_file(truncate0)
    res2 = multi_threading(judge_questions, "", file_id, 2)  # Call qwen-long

    if not res2:
        print("基础信息整合: multi_threading error!")
    else:
        for question, response in res2:
            baseinfo_list.append(clean_json_string(response))

    rebidding_situation = extract_from_notice(clause_path, 3)  # "重新招标, 不再招标和终止招标" comes from the bidder-notice body
    update_json = rename_outer_key(rebidding_situation, "重新招标、不再招标和终止招标")
    baseinfo_list.append(update_json)
    aggregated_baseinfo = aggregate_basic_info_engineering(baseinfo_list)  # Now a dict
    return {"基础信息": aggregated_baseinfo}


if __name__ == "__main__":
    knowledge_name = "ztb"
    output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405"
    truncate0 = "C:\\Users\\Administrator\\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405\\ztbfile_tobidders_notice_table.pdf"
    clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\3424b7cb-1f85-44b4-a432-44539b870405\\clause1.json"
    res = combine_basic_info(knowledge_name, truncate0, output_folder, clause_path)
    print(json.dumps(res, ensure_ascii=False, indent=4))
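A toy run of aggregate_basic_info_engineering, assuming the function above is importable (the sample keys and values are hypothetical):

sample = [
    {"招标人": "某建设单位", "项目名称": "示例工程"},
    {"踏勘现场": "不组织", "投标保证金": "人民币5万元"},
]
out = aggregate_basic_info_engineering(sample)
# dynamic_key_handling routes "踏勘现场" into "其他信息" and inserts
# "投标保证金" before "质量保证金"; keys missing from the input are
# filled with "未提供".
print(out["招标人/代理信息"]["招标人"])  # 某建设单位
print(out["保证金相关"]["投标保证金"])    # 人民币5万元
print(out["其他信息"]["踏勘现场"])       # 不组织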
373  flask_app/old_version/多线程分类_old.py  Normal file
@@ -0,0 +1,373 @@
# -*- coding: utf-8 -*-
import os
import shutil
from PyPDF2 import PdfReader, PdfWriter


def validate_pdf(file_path):
    """Check whether a PDF file is corrupt."""
    try:
        with open(file_path, "rb") as file:
            pdf = PdfReader(file)
            return len(pdf.pages) > 0
    except Exception as e:
        print(f"Error reading PDF {file_path}: {str(e)}")
        return False


def truncate_pdf(source_path, target_path, max_pages=15):
    """Keep only the first max_pages pages of a PDF and save the result."""
    try:
        with open(source_path, "rb") as infile:
            reader = PdfReader(infile)
            writer = PdfWriter()
            for i in range(min(max_pages, len(reader.pages))):
                writer.add_page(reader.pages[i])
            with open(target_path, "wb") as outfile:
                writer.write(outfile)
    except Exception as e:
        print(f"Error processing PDF {source_path}: {str(e)}")


def copy_file(src, dest):
    """Copy a single file."""
    os.makedirs(os.path.dirname(dest), exist_ok=True)
    shutil.copy2(src, dest)


def copy_directory(source, destination):
    """Copy an entire directory tree."""
    os.makedirs(destination, exist_ok=True)
    for item in os.listdir(source):
        src_path = os.path.join(source, item)
        dest_path = os.path.join(destination, item)
        if os.path.isfile(src_path):
            copy_file(src_path, dest_path)
        else:
            copy_directory(src_path, dest_path)


def unique_file_name(base_path, file_name):
    counter = 1
    name_part, extension = os.path.splitext(file_name)
    new_file_name = file_name
    # If the file already exists, append a counter to the name
    while os.path.exists(os.path.join(base_path, new_file_name)):
        new_file_name = f"{name_part}_{counter}{extension}"
        counter += 1
    return new_file_name


def process_pdf_folders(source_dir, target_dir):
    """Validate the PDFs under source_dir and copy the intact ones into target_dir."""
    for root, dirs, files in os.walk(source_dir, topdown=False):
        for file in files:
            if file.lower().endswith('.pdf'):
                source_file_path = os.path.join(root, file)
                if validate_pdf(source_file_path):
                    relative_path = os.path.relpath(root, source_dir)
                    target_file_dir = os.path.join(target_dir, relative_path)
                    target_file_path = os.path.join(target_file_dir, file)
                    copy_file(source_file_path, target_file_path)
                else:
                    print(f"Deleted corrupt file: {source_file_path}")
        # Remove empty directories
        if not os.listdir(root):
            os.rmdir(root)


def classify_folders(source_dir, target_dir, project_index):
    """Classifies folders and processes files based on specific criteria."""
    temp_dir = os.path.join(source_dir, 'temp')
    os.makedirs(temp_dir, exist_ok=True)

    target_project_dir = None
    target_zb_name = None  # Guard added: stays None when no zb.pdf folder is found (the original assumed one was always present)
    processed_dirs = set()  # Set to track processed directories

    for subdir in os.listdir(source_dir):
        subdir_path = os.path.join(source_dir, subdir)
        if not os.path.isdir(subdir_path):
            continue

        files = [f.lower() for f in os.listdir(subdir_path)]
        if 'zb.pdf' in files:
            target_project_dir, target_zb_name = process_tender_files(subdir_path, temp_dir, target_dir, project_index)
            processed_dirs.add(subdir_path)  # Mark this directory as processed
        elif subdir.lower() == "输出文件":
            process_evaluation_files(subdir_path, target_project_dir)
            processed_dirs.add(subdir_path)  # Mark this directory as processed

    # Process remaining folders, skipping already processed ones
    process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs, target_zb_name)
    # Remove the temp directory
    # if os.path.exists(temp_dir):
    #     shutil.rmtree(temp_dir)
    if os.path.exists(source_dir):
        shutil.rmtree(source_dir)


def process_tender_files(subdir_path, temp_dir, target_dir, project_index):
    """Processes tender files and returns the target project directory and updated index."""
    zb_path = os.path.join(subdir_path, "zb.pdf")
    truncated_zb_path = os.path.join(temp_dir, "truncate_zb.pdf")
    truncate_pdf(zb_path, truncated_zb_path, 30)  # Truncate to the first 30 pages
    bot_response = file_parse(truncated_zb_path, project_index, 1)
    zb_response = extract_answer(bot_response[1], 1)
    zb_response[0] = zb_response[0].replace('/', '-').replace('\\', '-')  # Replace slashes with '-'; directory names may not contain them
    target_zb_name, category = zb_response[2] + ".pdf", zb_response[3]

    new_zb_path = os.path.join(subdir_path, target_zb_name)
    os.rename(zb_path, new_zb_path)

    target_category_dir = os.path.join(target_dir, category)
    target_project_dir = os.path.join(target_category_dir, zb_response[0] + "_" + zb_response[1])
    copy_directory(subdir_path, os.path.join(target_project_dir, "招标文件夹"))

    os.remove(truncated_zb_path)
    shutil.rmtree(subdir_path)
    return target_project_dir, zb_response[2]


def process_evaluation_files(subdir_path, target_project_dir):
    """Processes evaluation folders."""
    copy_directory(subdir_path, os.path.join(target_project_dir, "评标文件夹"))
    shutil.rmtree(subdir_path)


def process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs, target_zb_name):
    """Processes remaining folders containing bid files."""
    target_tb_dir = os.path.join(target_project_dir, "投标文件夹")

    for subdir in os.listdir(source_dir):
        subdir_path = os.path.join(source_dir, subdir)
        if not os.path.isdir(subdir_path) or subdir_path in processed_dirs:
            continue  # Skip processed directories

        target_tbcom_dir = None  # Initialize outside the file loop

        for item in os.listdir(subdir_path):
            item_src_path = os.path.join(subdir_path, item)
            new_name = "truncate_" + item
            truncated_tb_path = os.path.join(temp_dir, new_name)
            truncate_pdf(item_src_path, truncated_tb_path)
            bot_response = file_parse(truncated_tb_path, project_index, 2)
            tb_response = extract_answer(bot_response[1], 2)
            if not tb_response:
                continue  # If there's no response, skip this file

            # Initialize target_tbcom_dir only once based on the first processed file
            if not target_tbcom_dir:
                target_tb_name, _ = tb_response
                if target_tb_name != target_zb_name and target_tb_name != "未知":
                    target_tbcom_dir = os.path.join(target_tb_dir, target_tb_name)
                    print(target_tbcom_dir)
                    os.makedirs(target_tbcom_dir, exist_ok=True)

            tb_section = tb_response[1] + ".pdf"
            new_tb_path = os.path.join(subdir_path, tb_section)
            # Pick a unique file name
            new_tb_path = os.path.join(subdir_path, unique_file_name(subdir_path, tb_section))
            os.rename(item_src_path, new_tb_path)

            # Remove temporary truncated file
            os.remove(truncated_tb_path)

        # Copy the whole directory at once after all files have been processed and renamed
        if target_tbcom_dir:
            copy_directory(subdir_path, target_tbcom_dir)
            shutil.rmtree(subdir_path)  # Optionally remove the original directory after copying


import re


def extract_answer(input_string, type):
    # Use regular expressions to match the required fields
    # Pattern 1: project name, project number, tenderer, category
    # \s* after every field value tolerates trailing spaces
    pattern1 = r"项目名称[::]\s*(.*?)[;;]\s*项目编号[::]\s*(.*?)[;;]\s*招标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*"
    # Pattern 2: bidder, category
    pattern2 = r"投标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*"

    if type == 1:
        match = re.search(pattern1, input_string)
        if match:
            print(f"1: {match.group(1).strip()} 2: {match.group(2).strip()} 3: {match.group(3).strip()} 4: {match.group(4).strip()}")
            return [match.group(1).strip(), match.group(2).strip(), match.group(3).strip(), match.group(4).strip()]
        else:
            print("No match found for type 1.")
            return []
    elif type == 2:
        match = re.search(pattern2, input_string)
        if match:
            # If the category mentions "投标函", normalize it to "商务文件"
            part = match.group(2).strip()
            if "投标函" in part:
                part = "商务文件"
            return [match.group(1).strip(), part]
        else:
            print("No match found for type 2.")
            return []


from llama_index.readers.dashscope.base import DashScopeParse
from llama_index.readers.dashscope.utils import ResultType
from llama_index.indices.managed.dashscope import DashScopeCloudIndex
from dashscope import Assistants, Messages, Runs, Threads


def send_message(assistant, index, documents, message='百炼是什么?'):
    print(f"Query: {message}")

    # create a thread.
    thread = Threads.create()
    print(thread)

    # create a message.
    message = Messages.create(thread.id, content=message)
    # create run
    run = Runs.create(thread.id, assistant_id=assistant.id)
    print("run:" + str(run))

    # get run status
    run_status = Runs.get(run.id, thread_id=thread.id)
    # print(run_status)

    # wait for run completed or requires_action
    run_status = Runs.wait(run.id, thread_id=thread.id)
    # print(run_status)

    # if the prompt requested a tool result, this is where it would be submitted
    run_status = Runs.get(run.id, thread_id=thread.id)
    # print(run_status)
    # verify_status_code(run_status)

    # get the thread messages.
    msgs = Messages.list(thread.id)
    # print(msgs)
    # print(json.dumps(msgs, default=lambda o: o.__dict__, sort_keys=True, indent=4))

    ans = []

    print("运行结果:")
    for message in msgs['data'][::-1]:
        ans.append(message['content'][0]['text']['value'])
        print("content: ", message['content'][0]['text']['value'])
    print("\n")
    deleteFileFromKnowledge(index, documents)
    return ans


def file_parse(filepath, knowledge_index, type):
    parse = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND)
    documents = parse.load_data(file_path=filepath)
    # Map the index value to a knowledge-base name
    index_to_name = {
        0: "文件分类知识库0",
        1: "文件分类知识库1",
        2: "文件分类知识库2",
        3: "文件分类知识库3"
    }
    # Resolve the name with .get; fall back to "0" when the index is unknown
    cloud_index_name = index_to_name.get(knowledge_index, "0")

    index = DashScopeCloudIndex(cloud_index_name)
    index._insert(documents)
    retriever = index.as_retriever()
    pipeline_id = str(retriever.pipeline_id)
    assistant = Assistants.create(
        model='qwen-plus',
        name='smart helper',
        description='智能助手,支持知识库查询和插件调用。',
        instructions='请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}',
        tools=[
            {
                "type": "code_interpreter"
            },
            {
                "type": "rag",
                "prompt_ra": {
                    "pipeline_id": pipeline_id,
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "query_word": {
                                "type": "str",
                                "value": "${document1}"
                            }
                        }
                    }
                }
            }]
    )
    questions1 = "这份招标文件的项目名称是什么?这份文件的招标编号是多少?这份文件的招标人是谁?这份招标文件属于以下哪类招标:服务类、工程类、还是货物类?你可以结合你对招投标的了解和以下内容:工程类招标投标是指建设单位对拟建的工程项目通过法定的程序和方式吸引建设项目的承包单位竞争;\
货物类招投标是指以货物作为采购对象的招标,业主或称购货方为获得货物通过招标的形式选择合格的供货商或称供货方,包括原材料、产品、设备、电能和固态、液态、气态物体等;服务类招标又称为服务采购,指的是除了工程和货之外的其他招标投标活动,服物招标投标范围包含建设工程的勘察、设计、监理、工程咨询评估、科技项目、科研课题、物业管理、金融保险服务等\
请按下列键值对格式给我提供信息,避免无关内容:‘项目名称:XXX;项目编号:XXX;招标人:XXX;类别:XXX‘"
    questions2 = "这份投标文件的投标人是谁?如果文件中未明确提及投标人,投标人以未知替代;这份文件属于哪类投标文件?你的回答仅限于以下三种:商务文件、技术文件、报价文件,不要给出其他回答。请按下列键值对格式给我提供信息,避免无关内容:‘投标人:XXX;类别:XXX’"
    if (type == 1):
        questions = questions1
    elif (type == 2):
        questions = questions2
    return send_message(assistant, index, documents, message=questions)


def deleteFileFromKnowledge(index, documents):
    # Collect the IDs of all documents
    file_ids = []

    # Make sure documents is a non-empty list
    if isinstance(documents, list) and documents:
        # Walk every document
        for document in documents:
            # Read each document's id_ attribute
            # getattr avoids an exception when the attribute is missing
            file_id = getattr(document, 'id_', None)
            if file_id:
                file_ids.append(file_id)  # Collect the id
    index.delete_ref_doc(file_ids)


import concurrent.futures
import logging

# Configure logging to the console only
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def process_directory(subdir_path, base_intermediate_directory, final_directory, project_index):
    logging.info(f"Starting to process directory: {subdir_path} with index {project_index}")

    # Create a dedicated temporary directory for each subdirectory
    intermediate_directory = os.path.join(base_intermediate_directory, f"intermediate_{project_index}")
    os.makedirs(intermediate_directory, exist_ok=True)
    logging.info(f"Created intermediate directory: {intermediate_directory}")

    # The concrete directory-processing steps go here, for example:
    process_pdf_folders(subdir_path, intermediate_directory)
    classify_folders(intermediate_directory, final_directory, project_index % 4)

    # Clean up the temporary directory afterwards
    shutil.rmtree(intermediate_directory)
    logging.info(f"Deleted intermediate directory: {intermediate_directory}")


def main(base_directory, base_intermediate_directory, final_directory):
    os.makedirs(final_directory, exist_ok=True)
    logging.info(f"Final directory ensured at: {final_directory}")

    project_index = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = []
        for subdir in os.listdir(base_directory):
            subdir_path = os.path.join(base_directory, subdir)
            if os.path.isdir(subdir_path):
                logging.info(f"Submitting job for directory: {subdir_path}")
                future = executor.submit(process_directory, subdir_path, base_intermediate_directory, final_directory, project_index)
                futures.append(future)
                project_index += 1

        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()  # Completes the task if no exception was raised
            except Exception as e:
                logging.error(f"Thread resulted in an error: {e}")


if __name__ == "__main__":
    base_directory = 'D:\\bidding_trading_files\\招投标文件2023\\2023'
    base_intermediate_directory = 'D:\\tmp'
    final_directory = 'D:\\output'
    main(base_directory, base_intermediate_directory, final_directory)
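A quick check of the extract_answer patterns above, assuming the function is in scope (the reply strings are hypothetical):

s1 = "项目名称:示例工程;项目编号:ZB-001;招标人:某单位;类别:工程类。"
print(extract_answer(s1, 1))  # -> ['示例工程', 'ZB-001', '某单位', '工程类']
s2 = "投标人:某公司;类别:投标函"
print(extract_answer(s2, 2))  # -> ['某公司', '商务文件']  ("投标函" is normalized)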
66  flask_app/old_version/废标项_old.py  Normal file
@@ -0,0 +1,66 @@
import fitz  # PyMuPDF
import re


def extract_text_with_keywords(pdf_path, keywords, follow_up_keywords):
    """
    Extract the paragraphs of a PDF document (body text and tables) that
    contain given keywords. When a paragraph contains one of the follow-up
    keywords, keep collecting the paragraphs after it until the next similar
    section number appears.

    :param pdf_path: Path to the PDF file.
    :param keywords: Keywords to match.
    :param follow_up_keywords: Keywords that trigger continued extraction.
    :return: List of matching paragraphs.
    """
    doc = fitz.open(pdf_path)
    extracted_paragraphs = []
    continue_collecting = False
    current_section_pattern = None

    for page in doc:
        text_blocks = page.get_text("blocks")
        for index, block in enumerate(text_blocks):
            text = block[4].strip()  # Text content of the block
            if text == "":  # Skip empty lines
                continue
            if continue_collecting:
                if current_section_pattern and re.match(current_section_pattern, text):
                    continue_collecting = False
                else:
                    extracted_paragraphs.append(text)

            if any(keyword in text for keyword in keywords):
                extracted_paragraphs.append(text)
                if any(follow_up in text for follow_up in follow_up_keywords):
                    continue_collecting = True
                    section_number = re.match(r'(\d+(\.\d+)*)', text)
                    if section_number:
                        current_section_number = section_number.group(1)
                        base_section_number = current_section_number.rsplit('.', 1)[0]
                        current_section_pattern = re.compile(rf'^{re.escape(base_section_number)}\.\d+\b')
                    else:
                        found_next_number = False
                        for next_index in range(index + 1, len(text_blocks)):
                            next_text = text_blocks[next_index][4].strip()
                            next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)', next_text)
                            if next_section_number:
                                found_next_number = True
                                current_section_pattern = re.compile(rf'^{next_section_number.group(1)}\b')
                            elif found_next_number:
                                break
    doc.close()
    return extracted_paragraphs


# Example usage
doc_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标02_invalid.docx'
keywords = ['否决', '无效投标', '被拒绝', '予以拒绝']
follow_up_keywords = ['情形之一']
extracted_contents = extract_text_with_keywords(doc_path, keywords, follow_up_keywords)

# Writing to file and handling duplicates
output_file = "C:\\Users\\Administrator\\Desktop\\temp.txt"
with open(output_file, 'w', encoding='utf-8') as file:
    for content in extracted_contents:
        file.write(content + '\n')

# file_id = upload_file(output_file)
# user_query="根据文本中的内容,请你回答,否决投标和拒绝投标的情况有哪些?由于文本中存在冗余且无关的信息,且文本中的序号是混乱的,请你重新编排答案返回给我,以json格式返回,你的回答仅限于原文内容但删去原有序号,请不要自己组织语言。"
# res=qianwen_long(file_id,user_query)
# print("Query Result:", res)
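A short illustration of the section-continuation rule used above (the sample headings are hypothetical):

import re

text = "3.4.1 有下列情形之一的,投标将被否决:"
m = re.match(r'(\d+(\.\d+)*)', text)
base = m.group(1).rsplit('.', 1)[0]              # '3.4'
stop = re.compile(rf'^{re.escape(base)}\.\d+\b')
# Collection continues until a sibling heading such as '3.4.2' appears:
print(bool(stop.match("3.4.2 评标程序")))      # True  -> stop collecting
print(bool(stop.match("(1)未按要求密封")))   # False -> keep collecting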
305
flask_app/old_version/招标文件解析_old.py
Normal file
305
flask_app/old_version/招标文件解析_old.py
Normal file
@ -0,0 +1,305 @@
|
|||||||
|
# -*- encoding:utf-8 -*-
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from flask_app.main.截取pdf import truncate_pdf_multiple
|
||||||
|
from flask_app.general.table_content_extraction import extract_tables_main
|
||||||
|
from flask_app.old_version.文档理解大模型版知识库处理.知识库操作_old import addfileToKnowledge, deleteKnowledge
|
||||||
|
from flask_app.main.提取json工程标版 import convert_clause_to_json
|
||||||
|
from flask_app.general.json_utils import transform_json_values
|
||||||
|
from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid
|
||||||
|
from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice
|
||||||
|
import concurrent.futures
|
||||||
|
from flask_app.old_version.基础信息整合_old import combine_basic_info
|
||||||
|
from flask_app.old_version.资格审查模块old_old import combine_review_standards
|
||||||
|
from flask_app.main.商务评分技术评分整合 import combine_evaluation_standards
|
||||||
|
from flask_app.general.format_change import pdf2docx, docx2pdf
|
||||||
|
from flask_app.general.docx截取docx import copy_docx
|
||||||
|
|
||||||
|
def get_global_logger(unique_id):
|
||||||
|
if unique_id is None:
|
||||||
|
return logging.getLogger() # 获取默认的日志器
|
||||||
|
logger = logging.getLogger(unique_id)
|
||||||
|
return logger
|
||||||
|
|
||||||
|
logger=None
|
||||||
|
|
||||||
|
# 创建全局线程池
|
||||||
|
executor = ThreadPoolExecutor()
|
||||||
|
def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
|
||||||
|
logger.info("starting 文件预处理...")
|
||||||
|
logger.info("output_folder..." + output_folder)
|
||||||
|
|
||||||
|
# 根据文件类型处理文件路径
|
||||||
|
if file_type == 1: # docx
|
||||||
|
docx_path = downloaded_file_path
|
||||||
|
pdf_path = docx2pdf(docx_path) # 将docx转换为pdf以供后续处理
|
||||||
|
elif file_type == 2: # pdf
|
||||||
|
pdf_path = downloaded_file_path
|
||||||
|
docx_path = pdf2docx(pdf_path) # 将pdf转换为docx以供上传到知识库
|
||||||
|
else:
|
||||||
|
logger.error("Unsupported file type provided. Preprocessing halted.")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# # 异步上传知识库
|
||||||
|
future_knowledge = executor.submit(addfileToKnowledge, docx_path, "招标解析" + unique_id)
|
||||||
|
|
||||||
|
# 调用截取PDF多次
|
||||||
|
truncate_files = truncate_pdf_multiple(pdf_path, output_folder)
|
||||||
|
|
||||||
|
# 处理各个部分
|
||||||
|
truncate0_docpath = pdf2docx(truncate_files[0]) # 投标人须知前附表转docx
|
||||||
|
invalid_docpath = copy_docx(docx_path) # docx截取无效标部分
|
||||||
|
truncate_jsonpath = extract_tables_main(truncate0_docpath, output_folder) # 投标人须知前附表docx->json
|
||||||
|
truncate0 = truncate_files[0]
|
||||||
|
truncate1 = truncate_files[1]
|
||||||
|
truncate3 = truncate_files[3]
|
||||||
|
merged_baseinfo_path=truncate_files[-1]
|
||||||
|
clause_path = convert_clause_to_json(truncate_files[2], output_folder) # 投标人须知正文条款pdf->json
|
||||||
|
|
||||||
|
logger.info("文件预处理done")
|
||||||
|
|
||||||
|
# 提前返回,不等待 future_knowledge 完成,返回包含 Future 对象
|
||||||
|
return {
|
||||||
|
'file_path': downloaded_file_path,
|
||||||
|
'output_folder': output_folder,
|
||||||
|
'truncate0': truncate0,
|
||||||
|
'truncate1': truncate1,
|
||||||
|
'truncate3': truncate3,
|
||||||
|
'knowledge_future': future_knowledge, # 返回 Future 对象
|
||||||
|
'truncate0_jsonpath': truncate_jsonpath,
|
||||||
|
'merged_baseinfo_path':merged_baseinfo_path,
|
||||||
|
'clause_path': clause_path,
|
||||||
|
'invalid_docpath': invalid_docpath
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def post_processing(data,includes):
|
||||||
|
# 初始化结果字典,预设'其他'分类为空字典
|
||||||
|
result = {"其他": {}}
|
||||||
|
|
||||||
|
# 遍历原始字典的每一个键值对
|
||||||
|
for key, value in data.items():
|
||||||
|
if key in includes:
|
||||||
|
# 如果键在includes列表中,直接保留这个键值对
|
||||||
|
result[key] = value
|
||||||
|
else:
|
||||||
|
# 如果键不在includes列表中,将这个键值对加入到'其他'分类中
|
||||||
|
result["其他"][key] = value
|
||||||
|
|
||||||
|
# 如果'其他'分类没有任何内容,可以选择删除这个键
|
||||||
|
if not result["其他"]:
|
||||||
|
del result["其他"]
|
||||||
|
|
||||||
|
return result
|
||||||
|
# 基本信息
|
||||||
|
def fetch_project_basic_info(knowledge_name, truncate0, output_folder, clause_path): # 投标人须知前附表
|
||||||
|
logger.info("starting基础信息...")
|
||||||
|
basic_res = combine_basic_info(knowledge_name, truncate0, output_folder, clause_path)
|
||||||
|
logger.info("基础信息done")
|
||||||
|
return basic_res
|
||||||
|
|
||||||
|
|
||||||
|
# 形式、响应、资格评审
|
||||||
|
def fetch_qualification_review(truncate1, truncate3, knowledge_name, truncate0_jsonpath, clause_path,input_file,output_folder):
|
||||||
|
logger.info("starting资格审查...")
|
||||||
|
review_standards_res = combine_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath,
|
||||||
|
clause_path,input_file,output_folder)
|
||||||
|
logger.info("资格审查done")
|
||||||
|
return review_standards_res
|
||||||
|
|
||||||
|
|
||||||
|
# 评分细则 流式
|
||||||
|
def fetch_evaluation_standards(truncate1): # 评标办法前附表
|
||||||
|
logger.info("starting 商务标和技术标...")
|
||||||
|
|
||||||
|
# 获取评标办法前附表的字典结果
|
||||||
|
evaluation_standards_res = combine_evaluation_standards(truncate1)
|
||||||
|
|
||||||
|
# 获取技术标和商务标
|
||||||
|
technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})}
|
||||||
|
commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})}
|
||||||
|
|
||||||
|
logger.info("商务标和技术标 done")
|
||||||
|
|
||||||
|
# 返回将 "技术标" 和 "商务标" 包含在新的键中
|
||||||
|
return {
|
||||||
|
"technical_standards": technical_standards,
|
||||||
|
"commercial_standards": commercial_standards
|
||||||
|
}
|
||||||
|

# Legacy variant kept for reference: returned the evaluation standards as one dict
# def fetch_evaluation_standards(truncate1):  # evaluation method front table
#     logger.info("starting商务标技术标...")
#     evaluation_standards_res = combine_evaluation_standards(truncate1)
#     logger.info("商务标技术标done")
#     return evaluation_standards_res


# Invalid-bid and rejected-bid parsing
def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3):
    # Rejected-bid requirements are extracted via Qianwen
    logger.info("starting无效标与废标...")
    find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3)
    logger.info("无效标与废标done...")
    return find_invalid_res


# Bidding document requirements
def fetch_bidding_documents_requirements(clause_path):
    logger.info("starting投标文件要求...")
    fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1)
    logger.info("投标文件要求done...")
    return {"投标文件要求": fetch_bidding_documents_requirements_json}


# Bid opening, evaluation and award process
def fetch_bid_opening(clause_path):
    logger.info("starting开评定标流程...")
    fetch_bid_opening_json = extract_from_notice(clause_path, 2)
    logger.info("开评定标流程done...")
    return {"开评定标流程": fetch_bid_opening_json}


# def main_processing(output_folder, downloaded_file_path, file_type, unique_id):  # file_type=1->docx file_type=2->pdf
#     global logger
#     logger = get_global_logger(unique_id)
#     # Preprocess files and get necessary data paths and knowledge index
#     processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id)
#     if not processed_data:
#         return ""
#
#     with concurrent.futures.ThreadPoolExecutor() as executor:
#         # Submit all tasks to the executor
#         futures = {
#             'base_info': executor.submit(fetch_project_basic_info, processed_data['knowledge_name'],
#                                          processed_data['truncate0'], output_folder,
#                                          processed_data['clause_path']),
#             'qualification_review': executor.submit(fetch_qualification_review, processed_data['truncate1'],
#                                                     processed_data['truncate3'],
#                                                     processed_data['knowledge_name'], processed_data['truncate0_jsonpath'],
#                                                     processed_data['clause_path'], processed_data['file_path'], processed_data['output_folder']),
#             'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate1']),
#             'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
#                                                     output_folder, processed_data['truncate0_jsonpath'],
#                                                     processed_data['clause_path'], processed_data['truncate3']),
#             'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements,
#                                                               processed_data['clause_path']),
#             'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path'])
#         }
#
#         comprehensive_responses = []
#         # Collect results in the defined order
#         for key in ['base_info', 'qualification_review', 'evaluation_standards', 'invalid_requirements',
#                     'bidding_documents_requirements', 'opening_bid']:
#             try:
#                 # Wait for the future to complete and get the result
#                 result = futures[key].result()
#                 comprehensive_responses.append(result)
#             except Exception as exc:
#                 logger.error(f"Error processing {key}: {exc}")
#         # Merge the JSON results
#         combined_final_result = combine_json_results(comprehensive_responses)
#         includes = ["基础信息", "资格审查", "商务标", "技术标", "无效标与废标项", "投标文件要求", "开评定标流程"]
#         result = post_processing(combined_final_result, includes)
#         modified_json = transform_json_values(result)
#
#         final_result_path = os.path.join(output_folder, "final_result.json")
#         with open(final_result_path, 'w', encoding='utf-8') as file:
#             json.dump(modified_json, file, ensure_ascii=False, indent=2)
#         logger.info("final_result.json has been saved")
#         deleteKnowledge(processed_data['knowledge_index'])
#         return final_result_path


# Staged return: yield each result as soon as it is ready
def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_id):
    global logger
    logger = get_global_logger(unique_id)

    # Preprocess the files and collect the derived paths
    processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id)
    if not processed_data:
        yield json.dumps({})  # preprocessing failed: emit an empty JSON payload
        return  # stop the generator; nothing further can be processed

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Immediately launch the tasks that do not depend on knowledge_name / index
        futures = {
            'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate1']),
            'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
                                                    output_folder, processed_data['truncate0_jsonpath'],
                                                    processed_data['clause_path'], processed_data['truncate3']),
            'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements, processed_data['clause_path']),
            'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path'])
        }

        # Stream these independent tasks back in completion order
        for future in concurrent.futures.as_completed(futures.values()):
            key = next(k for k, v in futures.items() if v == future)
            try:
                result = future.result()

                # evaluation_standards is split into its technical and commercial parts
                if key == 'evaluation_standards':
                    technical_standards = result["technical_standards"]
                    commercial_standards = result["commercial_standards"]

                    # Yield the technical and commercial scores separately
                    yield json.dumps({'technical_standards': transform_json_values(technical_standards)}, ensure_ascii=False)
                    yield json.dumps({'commercial_standards': transform_json_values(commercial_standards)}, ensure_ascii=False)
                else:
                    # All other task results are yielded as-is
                    yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False)
            except Exception as exc:
                logger.error(f"Error processing {key}: {exc}")
                yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)

        # Only block on future_knowledge once knowledge_name and index are actually needed
        index = None
        try:
            knowledge_name = "招标解析" + unique_id
            index = processed_data['knowledge_future'].result()  # wait for the knowledge-base upload to finish

            # Submit the tasks that depend on knowledge_name and index
            future_dependencies = {
                'base_info': executor.submit(fetch_project_basic_info, knowledge_name, processed_data['truncate0'],
                                             output_folder, processed_data['clause_path']),
                'qualification_review': executor.submit(fetch_qualification_review, processed_data['truncate1'],
                                                        processed_data['truncate3'], knowledge_name,
                                                        processed_data['truncate0_jsonpath'],
                                                        processed_data['clause_path'], processed_data['file_path'],
                                                        processed_data['output_folder']),
            }

            # Yield the dependent results in completion order
            for future in concurrent.futures.as_completed(future_dependencies.values()):
                key = next(k for k, v in future_dependencies.items() if v == future)
                try:
                    result = future.result()
                    yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False)
                except Exception as exc:
                    logger.error(f"Error processing {key}: {exc}")
                    yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)
        except Exception as e:
            logger.error(f"Error uploading to knowledge base: {e}")
            yield json.dumps({'error': f'Knowledge upload failed: {str(e)}'}, ensure_ascii=False)

        # Delete the knowledge index (skip if the upload never completed)
        if index is not None:
            deleteKnowledge(index)

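
# A minimal sketch of how the streaming entry point is consumed; the arguments
# are placeholders and engineering_bid_main must be fed a real tender document.
def _demo_stream_results(output_folder, input_file):
    # Each yielded chunk is an independent JSON string, printable or pushable
    # to a client as soon as it arrives.
    for chunk in engineering_bid_main(output_folder, input_file, file_type=2, unique_id="demo-id"):
        print(chunk)
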

if __name__ == "__main__":
    output_folder = "flask_app/static/output/zytest1"
    # truncate0 = os.path.join(output_folder, "ztb_tobidders_notice_table.pdf")
    # truncate1 = os.path.join(output_folder, "ztb_evaluation_method.pdf")
    # clause_path = convert_clause_to_json(truncate1, output_folder)
    # truncate0_jsonpath = os.path.join(output_folder, "truncate_output3.json")

    start_time = time.time()
    file_type = 2  # 1: docx  2: pdf  3: other; the test input below is a pdf
    input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest11.pdf"
    # file_path = main_processing(output_folder, input_file, file_type, "uuidzyzy11")

    preprocess_files(output_folder, input_file, file_type, "zytest1")
    end_time = time.time()
    elapsed_time = end_time - start_time  # elapsed wall-clock time
    print(f"Function execution took {elapsed_time} seconds.")
350
flask_app/old_version/文件分类普通版_old.py
Normal file
@ -0,0 +1,350 @@
# -*- coding: utf-8 -*-
import os
import shutil
from PyPDF2 import PdfReader, PdfWriter


def validate_pdf(file_path):
    """Check whether a PDF file is readable (i.e. not corrupt)."""
    try:
        with open(file_path, "rb") as file:
            pdf = PdfReader(file)
            return len(pdf.pages) > 0
    except Exception as e:
        print(f"Error reading PDF {file_path}: {str(e)}")
        return False


def truncate_pdf(source_path, target_path, max_pages=15):
    """Copy at most the first max_pages pages of a PDF to target_path."""
    try:
        with open(source_path, "rb") as infile:
            reader = PdfReader(infile)
            writer = PdfWriter()
            for i in range(min(max_pages, len(reader.pages))):
                writer.add_page(reader.pages[i])
            with open(target_path, "wb") as outfile:
                writer.write(outfile)
    except Exception as e:
        print(f"Error processing PDF {source_path}: {str(e)}")

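
# Hypothetical usage of the two helpers above: keep the first five pages of a
# source PDF, then confirm the result opens cleanly. The paths are placeholders.
def _demo_truncate_pdf():
    truncate_pdf("example_in.pdf", "example_head.pdf", max_pages=5)
    print(validate_pdf("example_head.pdf"))
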
def copy_file(src, dest):
    """Copy a single file, creating the destination directory if needed."""
    os.makedirs(os.path.dirname(dest), exist_ok=True)
    shutil.copy2(src, dest)


def copy_directory(source, destination):
    """Recursively copy an entire directory tree."""
    os.makedirs(destination, exist_ok=True)
    for item in os.listdir(source):
        src_path = os.path.join(source, item)
        dest_path = os.path.join(destination, item)
        if os.path.isfile(src_path):
            copy_file(src_path, dest_path)
        else:
            copy_directory(src_path, dest_path)


def unique_file_name(base_path, file_name):
    counter = 1
    name_part, extension = os.path.splitext(file_name)
    new_file_name = file_name
    # If the name is taken, append _1, _2, ... until it is unique
    while os.path.exists(os.path.join(base_path, new_file_name)):
        new_file_name = f"{name_part}_{counter}{extension}"
        counter += 1
    return new_file_name

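
# Sketch of the collision behaviour of unique_file_name; the directory and
# file names here are hypothetical.
def _demo_unique_file_name():
    # With "报价文件.pdf" already present in base_path, the helper would
    # return "报价文件_1.pdf"; a second collision yields "报价文件_2.pdf".
    print(unique_file_name("D:\\output\\投标文件夹", "报价文件.pdf"))
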
def process_pdf_folders(source_dir, target_dir):
    """Validate PDFs under source_dir and copy the good ones to target_dir."""
    for root, dirs, files in os.walk(source_dir, topdown=False):
        for file in files:
            if file.lower().endswith('.pdf'):
                source_file_path = os.path.join(root, file)
                if validate_pdf(source_file_path):
                    relative_path = os.path.relpath(root, source_dir)
                    target_file_dir = os.path.join(target_dir, relative_path)
                    target_file_path = os.path.join(target_file_dir, file)
                    copy_file(source_file_path, target_file_path)
                else:
                    print(f"Deleted corrupt file: {source_file_path}")
        # Remove directories that are now empty
        if not os.listdir(root):
            os.rmdir(root)


def classify_folders(source_dir, target_dir, project_index):
    """Classifies folders and processes files based on specific criteria."""
    temp_dir = os.path.join(source_dir, 'temp')
    os.makedirs(temp_dir, exist_ok=True)

    target_project_dir = None
    target_zb_name = None  # guards against source trees that contain no zb.pdf
    processed_dirs = set()  # Set to track processed directories

    for subdir in os.listdir(source_dir):
        subdir_path = os.path.join(source_dir, subdir)
        if not os.path.isdir(subdir_path):
            continue

        files = [f.lower() for f in os.listdir(subdir_path)]
        if 'zb.pdf' in files:
            target_project_dir, project_index, target_zb_name = process_tender_files(subdir_path, temp_dir, target_dir, project_index)
            processed_dirs.add(subdir_path)  # Mark this directory as processed
        elif subdir.lower() == "输出文件":
            process_evaluation_files(subdir_path, target_project_dir)
            processed_dirs.add(subdir_path)  # Mark this directory as processed

    # Process remaining folders, skipping already processed ones
    process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs, target_zb_name)
    # Remove the temp directory
    # if os.path.exists(temp_dir):
    #     shutil.rmtree(temp_dir)
    if os.path.exists(source_dir):
        shutil.rmtree(source_dir)

def process_tender_files(subdir_path, temp_dir, target_dir, project_index):
    """Processes tender files and returns the target project directory and updated index."""
    zb_path = os.path.join(subdir_path, "zb.pdf")
    truncated_zb_path = os.path.join(temp_dir, "truncate_zb.pdf")
    truncate_pdf(zb_path, truncated_zb_path, 30)  # Truncate to the first 30 pages
    bot_response = file_parse(truncated_zb_path, "file_" + str(project_index), 1)
    project_index += 1
    zb_response = extract_answer(bot_response[1], 1)
    zb_response[0] = zb_response[0].replace('/', '-').replace('\\', '-')  # slashes are not allowed in directory names
    target_zb_name, category = zb_response[2] + ".pdf", zb_response[3]

    new_zb_path = os.path.join(subdir_path, target_zb_name)
    os.rename(zb_path, new_zb_path)

    target_category_dir = os.path.join(target_dir, category)
    target_project_dir = os.path.join(target_category_dir, zb_response[0] + "_" + zb_response[1])
    copy_directory(subdir_path, os.path.join(target_project_dir, "招标文件夹"))

    os.remove(truncated_zb_path)
    shutil.rmtree(subdir_path)
    return target_project_dir, project_index, zb_response[2]


def process_evaluation_files(subdir_path, target_project_dir):
    """Processes evaluation folders."""
    copy_directory(subdir_path, os.path.join(target_project_dir, "评标文件夹"))
    shutil.rmtree(subdir_path)


def process_remaining_folders(source_dir, target_project_dir, project_index, temp_dir, processed_dirs, target_zb_name):
    """Processes remaining folders containing bid files."""
    target_tb_dir = os.path.join(target_project_dir, "投标文件夹")

    for subdir in os.listdir(source_dir):
        subdir_path = os.path.join(source_dir, subdir)
        if not os.path.isdir(subdir_path) or subdir_path in processed_dirs:
            continue  # Skip processed directories

        target_tbcom_dir = None  # Initialize outside the file loop

        for item in os.listdir(subdir_path):
            item_src_path = os.path.join(subdir_path, item)
            new_name = "truncate_" + item
            truncated_tb_path = os.path.join(temp_dir, new_name)
            truncate_pdf(item_src_path, truncated_tb_path)
            bot_response = file_parse(truncated_tb_path, "file_" + str(project_index), 2)
            project_index += 1
            tb_response = extract_answer(bot_response[1], 2)
            if not tb_response:
                continue  # If there's no response, skip this file

            # Initialize target_tbcom_dir only once based on the first processed file
            if not target_tbcom_dir:
                target_tb_name, _ = tb_response
                if target_tb_name != target_zb_name and target_tb_name != "未知":
                    target_tbcom_dir = os.path.join(target_tb_dir, target_tb_name)
                    print(target_tbcom_dir)
                    os.makedirs(target_tbcom_dir, exist_ok=True)

            tb_section = tb_response[1] + ".pdf"
            # Build a collision-free file name inside the folder
            new_tb_path = os.path.join(subdir_path, unique_file_name(subdir_path, tb_section))
            os.rename(item_src_path, new_tb_path)

            # Remove temporary truncated file
            os.remove(truncated_tb_path)

        # Copy the whole directory at once after all files have been processed and renamed
        if target_tbcom_dir:
            copy_directory(subdir_path, target_tbcom_dir)
            shutil.rmtree(subdir_path)  # Optionally remove the original directory after copying

import re


def extract_answer(input_string, type):
    # Use regular expressions to pull out the required fields
    # Pattern 1: project name, project number, tenderer, category
    # A \s* follows each field value so trailing spaces are ignored
    pattern1 = r"项目名称[::]\s*(.*?)[;;]\s*项目编号[::]\s*(.*?)[;;]\s*招标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*"
    # Pattern 2: bidder, category
    pattern2 = r"投标人[::]\s*(.*?)[;;]\s*类别[::]\s*([^。;;]*).*"

    if type == 1:
        match = re.search(pattern1, input_string)
        if match:
            print(f"1: {match.group(1).strip()} 2: {match.group(2).strip()} 3: {match.group(3).strip()} 4: {match.group(4).strip()}")
            return [match.group(1).strip(), match.group(2).strip(), match.group(3).strip(), match.group(4).strip()]
        else:
            print("No match found for type 1.")
            return []
    elif type == 2:
        match = re.search(pattern2, input_string)
        if match:
            # If the category mentions “投标函”, normalise it to “商务文件”
            part = match.group(2).strip()
            if "投标函" in part:
                part = "商务文件"
            return [match.group(1).strip(), part]
        else:
            print("No match found for type 2.")
            return []

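
# A quick sketch of what the two patterns accept; the answer strings below are
# hypothetical model outputs in the format requested by the prompts further down.
def _demo_extract_answer():
    ans1 = "项目名称:某道路改造工程;项目编号:ZB2023-001;招标人:某市建设局;类别:工程类"
    assert extract_answer(ans1, 1) == ["某道路改造工程", "ZB2023-001", "某市建设局", "工程类"]
    ans2 = "投标人:某建筑公司;类别:投标函"
    # “投标函” is normalised to “商务文件”
    assert extract_answer(ans2, 2) == ["某建筑公司", "商务文件"]
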

from llama_index.readers.dashscope.base import DashScopeParse
from llama_index.readers.dashscope.utils import ResultType
from llama_index.indices.managed.dashscope import DashScopeCloudIndex
from dashscope import Assistants, Messages, Runs, Threads


def send_message(assistant, index, documents, message='百炼是什么?'):
    print(f"Query: {message}")

    # create a thread.
    thread = Threads.create()
    print(thread)

    # create a message.
    message = Messages.create(thread.id, content=message)

    # create a run
    run = Runs.create(thread.id, assistant_id=assistant.id)
    print("run:" + str(run))

    # get run status
    run_status = Runs.get(run.id, thread_id=thread.id)
    # print(run_status)

    # wait for run completed or requires_action
    run_status = Runs.wait(run.id, thread_id=thread.id)
    # print(run_status)

    # if the prompt required a tool result, this is where it would be submitted
    run_status = Runs.get(run.id, thread_id=thread.id)
    # print(run_status)
    # verify_status_code(run_status)

    # get the thread messages.
    msgs = Messages.list(thread.id)
    # print(msgs)
    # print(json.dumps(msgs, default=lambda o: o.__dict__, sort_keys=True, indent=4))

    ans = []

    print("运行结果:")
    for message in msgs['data'][::-1]:
        ans.append(message['content'][0]['text']['value'])
        print("content: ", message['content'][0]['text']['value'])
    print("\n")
    deleteFileFromKnowledge(index, documents)
    return ans


def file_parse(filepath, knowledge_index, type):
    parse = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND)
    documents = parse.load_data(file_path=filepath)
    index = DashScopeCloudIndex("文件分类临时知识库")
    index._insert(documents)
    retriever = index.as_retriever()
    pipeline_id = str(retriever.pipeline_id)
    assistant = Assistants.create(
        model='qwen-max',
        name='smart helper',
        description='智能助手,支持知识库查询和插件调用。',
        instructions='请记住以下材料,他们对回答问题有帮助,请你简洁准确地给出回答,不要给出无关内容。${document1}',
        tools=[
            {
                "type": "code_interpreter"
            },
            {
                "type": "rag",
                "prompt_ra": {
                    "pipeline_id": pipeline_id,
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "query_word": {
                                "type": "str",
                                "value": "${document1}"
                            }
                        }
                    }
                }
            }]
    )
    questions1 = "这份招标文件的项目名称是什么?这份文件的招标编号是多少?这份文件的招标人是谁?这份招标文件属于以下哪类招标:服务类、工程类、还是货物类?你可以结合你对招投标的了解和以下内容:工程类招标投标是指建设单位对拟建的工程项目通过法定的程序和方式吸引建设项目的承包单位竞争;\
货物类招投标是指以货物作为采购对象的招标,业主或称购货方为获得货物通过招标的形式选择合格的供货商或称供货方,包括原材料、产品、设备、电能和固态、液态、气态物体等;服务类招标又称为服务采购,指的是除了工程和货物之外的其他招标投标活动,服务招标投标范围包含建设工程的勘察、设计、监理、工程咨询评估、科技项目、科研课题、物业管理、金融保险服务等\
请按下列格式给我提供信息,避免无关内容:‘项目名称:XXX;项目编号:XXX;招标人:XXX;类别:XXX‘"
    questions2 = "这份投标文件的投标人是谁?如果文件中未明确提及投标人,投标人以未知替代;这份文件属于哪类投标文件?你的回答仅限于以下三种:商务文件、技术文件、报价文件,不要给出其他回答。请按下列格式给我提供信息,避免无关内容:‘投标人:XXX;类别:XXX’"
    if type == 1:
        questions = questions1
    elif type == 2:
        questions = questions2
    return send_message(assistant, index, documents, message=questions)


def deleteFileFromKnowledge(index, documents):
    # Collect the IDs of every document in the list
    file_ids = []

    # Only proceed when documents is a non-empty list
    if isinstance(documents, list) and documents:
        for document in documents:
            # Read each document's id_ attribute; getattr avoids an
            # AttributeError when the attribute is missing
            file_id = getattr(document, 'id_', None)
            if file_id:
                file_ids.append(file_id)
    index.delete_ref_doc(file_ids)

def process_directory(base_directory, intermediate_directory, final_directory):
    # Walk every subdirectory under base_directory
    for subdir in os.listdir(base_directory):
        subdir_path = os.path.join(base_directory, subdir)
        # Only handle directories
        if os.path.isdir(subdir_path):
            # Process each project directory in turn
            process_pdf_folders(subdir_path, intermediate_directory)
            classify_folders(intermediate_directory, final_directory, 0)

            # Clear the intermediate directory before the next project
            if os.path.exists(intermediate_directory):
                shutil.rmtree(intermediate_directory)


def main(base_directory, intermediate_directory, final_directory):
    # Make sure the intermediate and final directories exist
    os.makedirs(intermediate_directory, exist_ok=True)
    os.makedirs(final_directory, exist_ok=True)
    process_directory(base_directory, intermediate_directory, final_directory)


if __name__ == "__main__":
    base_directory = 'D:\\bidding_trading_files\\招投标文件2023\\2023'
    intermediate_directory = 'D:\\tmp'
    final_directory = 'D:\\output'
    main(base_directory, intermediate_directory, final_directory)  # process a nested directory tree
    # process_pdf_folders(base_directory, intermediate_directory)  # process a single-level directory
    # classify_folders(intermediate_directory, final_directory, 0)
55
flask_app/old_version/文档理解大模型版知识库处理/删除知识库_old.py
Normal file
@ -0,0 +1,55 @@
# -*- coding: utf-8 -*-
# This file is auto-generated, don't edit it. Thanks.
import os

from alibabacloud_bailian20231229.client import Client as bailian20231229Client
from alibabacloud_tea_openapi import models as open_api_models
from alibabacloud_bailian20231229 import models as bailian_20231229_models
from alibabacloud_tea_util import models as util_models
from alibabacloud_tea_util.client import Client as UtilClient


def create_client() -> bailian20231229Client:
    """
    Initialise the account client with an AK & SK pair.
    @return: Client
    @throws Exception
    """
    config = open_api_models.Config(
        access_key_id=os.environ['ALIBABA_CLOUD_ACCESS_KEY_ID'],
        access_key_secret=os.environ['ALIBABA_CLOUD_ACCESS_KEY_SECRET']
    )
    config.endpoint = 'bailian.cn-beijing.aliyuncs.com'
    return bailian20231229Client(config)


def delete_index(client: bailian20231229Client, workspace_id: str, index_id: str) -> None:
    delete_index_request = bailian_20231229_models.DeleteIndexRequest(
        index_id=index_id
    )
    runtime = util_models.RuntimeOptions()
    headers = {}
    try:
        response = client.delete_index_with_options(workspace_id, delete_index_request, headers, runtime)
        print("routes Response:", response)
    except Exception as error:
        print(error.message)
        print(error.data.get("Recommend"))
        UtilClient.assert_as_string(error.message)


async def delete_index_async(client: bailian20231229Client, workspace_id: str, index_id: str) -> None:
    delete_index_request = bailian_20231229_models.DeleteIndexRequest(
        index_id=index_id
    )
    runtime = util_models.RuntimeOptions()
    headers = {}
    try:
        response = await client.delete_index_with_options_async(workspace_id, delete_index_request, headers, runtime)
        print("routes Response:", response)
    except Exception as error:
        print(error.message)
        print(error.data.get("Recommend"))
        UtilClient.assert_as_string(error.message)


if __name__ == '__main__':
    workspace_id = os.environ['DASHSCOPE_WORKSPACE_ID']
    index_id = 'pg5rrsv26x'
    client = create_client()
    delete_index(client, workspace_id, index_id)
58
flask_app/old_version/文档理解大模型版知识库处理/知识库操作_old.py
Normal file
@ -0,0 +1,58 @@
import os
import uuid
from llama_index.readers.dashscope.base import DashScopeParse
from llama_index.readers.dashscope.utils import ResultType
from llama_index.indices.managed.dashscope import DashScopeCloudIndex
from flask_app.old_version.文档理解大模型版知识库处理.删除知识库_old import delete_index, create_client


def addfileToKnowledge(filepath, knowledge_name):
    parse = DashScopeParse(result_type=ResultType.DASHSCOPE_DOCMIND)
    documents = parse.load_data(file_path=filepath)
    index = DashScopeCloudIndex.from_documents(
        documents,
        knowledge_name,
        verbose=True,
    )
    print("knowledge created successfully!!!")
    # index = DashScopeCloudIndex(knowledge_name)
    # index._insert(documents)
    # return index, documents
    return index


def deleteKnowledge(index):
    retriever = index.as_retriever()
    index_id = str(retriever.pipeline_id)
    workspace_id = os.environ.get('DASHSCOPE_WORKSPACE_ID')
    client = create_client()
    delete_index(client, workspace_id, index_id)
    print("knowledge deleted successfully!!!")


def deleteFileFromKnowledge(index, documents):
    # Collect the IDs of every document in the list
    file_ids = []
    # Only proceed when documents is a non-empty list
    if isinstance(documents, list) and documents:
        for document in documents:
            # Read each document's id_ attribute; getattr avoids an
            # AttributeError when the attribute is missing
            file_id = getattr(document, 'id_', None)
            if file_id:
                file_ids.append(file_id)
    print("deleted successfully")
    index.delete_ref_doc(file_ids)


# Example usage
if __name__ == "__main__":
    filepath = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标01.pdf"
    unique_id = str(uuid.uuid4())
    knowledge_name = "招标解析" + unique_id
    # index = addfileToKnowledge(filepath, knowledge_name)
    index = DashScopeCloudIndex("招标解析e8cc45f4-cd41-47cf-a5e6-2b7885debfff")
    # Delete individual files
    # deleteFileFromKnowledge(index, document)
    deleteKnowledge(index)
33
flask_app/old_version/文档理解大模型版知识库处理/调用文件解析状态查询_old.py
Normal file
@ -0,0 +1,33 @@
from typing import List
from alibabacloud_docmind_api20220711.client import Client as docmind_api20220711Client
from alibabacloud_tea_openapi import models as open_api_models
from alibabacloud_docmind_api20220711 import models as docmind_api20220711_models
from alibabacloud_tea_util.client import Client as UtilClient
from alibabacloud_credentials.client import Client as CredClient


def query():
    # Initialise the Credentials client with the default credential chain
    cred = CredClient()
    config = open_api_models.Config(
        # AccessKey ID taken from the credentials configuration
        cred.get_credential().access_key_id,
        # AccessKey Secret taken from the credentials configuration
        cred.get_credential().access_key_secret,
    )
    # Endpoint to call
    config.endpoint = f'docmind-api.cn-hangzhou.aliyuncs.com'
    client = docmind_api20220711Client(config)
    request = docmind_api20220711_models.QueryDocParserStatusRequest(
        # id: the id returned by the job-submission endpoint
        id='docmind-20241008-24363df30d274863894f037dbb7244e8'
    )
    try:
        # Print the response yourself when running this snippet
        response = client.query_doc_parser_status(request)
        # The response is nested body -> data -> attributes; attribute names start lowercase.
        # It is easiest to convert response.body.data to JSON first, then pick the needed values.
        print(response.body.data.status)
    except Exception as error:
        # Print the error here if needed
        UtilClient.assert_as_string(error.message)

query()
40
flask_app/old_version/文档理解大模型版知识库处理/调用文档解析异步提交_old.py
Normal file
@ -0,0 +1,40 @@
from alibabacloud_docmind_api20220711.client import Client as docmind_api20220711Client
from alibabacloud_tea_openapi import models as open_api_models
from alibabacloud_docmind_api20220711 import models as docmind_api20220711_models
from alibabacloud_tea_util.client import Client as UtilClient
from alibabacloud_tea_util import models as util_models
from alibabacloud_credentials.client import Client as CredClient


def submit_file():
    # Initialise the Credentials client with the default credential chain
    cred = CredClient()
    config = open_api_models.Config(
        # AccessKey ID taken from the credentials configuration
        # access_key_id=cred.get_access_key_id(),
        cred.get_credential().access_key_id,
        # AccessKey Secret taken from the credentials configuration
        # access_key_secret=cred.get_access_key_secret()
        cred.get_credential().access_key_secret,
    )
    # Endpoint to call
    config.endpoint = f'docmind-api.cn-hangzhou.aliyuncs.com'
    client = docmind_api20220711Client(config)
    request = docmind_api20220711_models.SubmitDocParserJobAdvanceRequest(
        # file_url_object: a local file stream
        file_url_object=open("zbtest4.pdf", "rb"),
        # file_name: the file name, which must include the file type
        file_name='zbtest4.pdf'
        # file_name_extension: the file suffix; an alternative to file_name
        # file_name_extension='pdf'
    )
    runtime = util_models.RuntimeOptions()
    try:
        # Print the response yourself when running this snippet
        response = client.submit_doc_parser_job_advance(request, runtime)
        # The response is nested body -> data -> attributes; this example prints the job id.
        # Attribute names start lowercase.
        print(response.body.data.id)
    except Exception as error:
        # Print the error here if needed
        UtilClient.assert_as_string(error.message)

submit_file()
35
flask_app/old_version/文档理解大模型版知识库处理/调用文档解析结果获取_old.py
Normal file
@ -0,0 +1,35 @@
from typing import List
from alibabacloud_docmind_api20220711.client import Client as docmind_api20220711Client
from alibabacloud_tea_openapi import models as open_api_models
from alibabacloud_docmind_api20220711 import models as docmind_api20220711_models
from alibabacloud_tea_util.client import Client as UtilClient
from alibabacloud_credentials.client import Client as CredClient


def query():
    # Initialise the Credentials client with the default credential chain
    cred = CredClient()
    config = open_api_models.Config(
        # AccessKey ID taken from the credentials configuration
        cred.get_credential().access_key_id,
        # AccessKey Secret taken from the credentials configuration
        cred.get_credential().access_key_secret,
    )
    # Endpoint to call
    config.endpoint = f'docmind-api.cn-hangzhou.aliyuncs.com'
    client = docmind_api20220711Client(config)
    request = docmind_api20220711_models.GetDocParserResultRequest(
        # id: the id returned by the job-submission endpoint
        id='docmind-20241008-24363df30d274863894f037dbb7244e8',
        layout_step_size=10,
        layout_num=0
    )
    try:
        # Print the response yourself when running this snippet
        response = client.get_doc_parser_result(request)
        # The response is nested body -> data -> attributes; attribute names start lowercase.
        # It is easiest to convert response.body.data to JSON first, then pick the needed values.
        print(response.body.data)
    except Exception as error:
        # Print the error here if needed
        UtilClient.assert_as_string(error.message)

query()
440
flask_app/old_version/无效标和废标和禁止投标整合_old.py
Normal file
@ -0,0 +1,440 @@
# -*- coding: utf-8 -*-
import json
import os.path
import time
import re

from flask_app.general.format_change import pdf2docx
from flask_app.general.通义千问long import upload_file, qianwen_long
from concurrent.futures import ThreadPoolExecutor

from flask_app.general.table_content_extraction import extract_tables_main
from flask_app.old_version.不得存在及禁止投标情形_old import find_forbidden, process_string_list


# Merge paragraphs that were split across page breaks
def preprocess_paragraphs(paragraphs):
    processed = []  # accumulates the cleaned paragraph texts
    index = 0
    while index < len(paragraphs):
        current_text = paragraphs[index].text.strip()  # strip surrounding whitespace

        # An empty paragraph may mark a page break between two halves of one paragraph
        if current_text == '':
            # Only consider it when both a previous and a next paragraph exist
            if 0 < index < len(paragraphs) - 1:
                prev_text = paragraphs[index - 1].text.strip()
                next_text = paragraphs[index + 1].text.strip()

                # Merge only if the previous paragraph does not already end with punctuation
                if not prev_text.endswith((',', ',', '。', '!', '?')):
                    # Pattern for list items such as (1), a., 一、
                    list_item_pattern = r'^\s*([((]\d+[))]|[A-Za-z]\.\s*|[一二三四五六七八九十]+、)'
                    # Skip the merge if the next paragraph starts a new list item
                    # is_prev_list = re.match(list_item_pattern, prev_text)
                    is_next_list = re.match(list_item_pattern, next_text)
                    if not is_next_list and len(prev_text) > 30:
                        # Join the two halves directly; no space is inserted between CJK text
                        merged_text = prev_text + next_text
                        if processed:
                            processed[-1] = merged_text  # replace the previously stored half
                        else:
                            processed.append(merged_text)  # list was empty: store the merged text

                        # Skip the next paragraph, it has just been merged
                        index += 2
                        continue
        else:
            # Non-empty paragraph: keep it as-is
            processed.append(current_text)
        index += 1

    return processed

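
# A small sketch of the page-break merge; SimpleNamespace stands in for
# python-docx paragraph objects, which only need a .text attribute here.
def _demo_preprocess_paragraphs():
    from types import SimpleNamespace
    paras = [SimpleNamespace(text="投标人应当在招标文件规定的投标截止时间前递交投标文件,逾期送达的投标文件"),
             SimpleNamespace(text=""),
             SimpleNamespace(text="招标人将拒收。")]
    # The empty paragraph marks the page break, so the halves are rejoined.
    print(preprocess_paragraphs(paras))
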

# If the current paragraph carries a section number, collect forward until the same numbering style recurs.
# If it carries no number, look ahead for a numbering style and collect the run of same-style items.

def extract_text_with_keywords(doc_path, keywords, follow_up_keywords):
    from collections import OrderedDict
    from docx import Document
    import re

    if isinstance(keywords, str):
        keywords = [keywords]

    doc = Document(doc_path)
    extracted_paragraphs = OrderedDict()
    continue_collecting = False
    current_section_pattern = None
    active_key = None

    def match_keywords(text, patterns):
        return any(re.search(pattern, text, re.IGNORECASE) for pattern in patterns)

    def extract_from_text(text, current_index):
        nonlocal continue_collecting, current_section_pattern, active_key
        if text == "":
            return current_index

        if continue_collecting:
            if current_section_pattern and re.match(current_section_pattern, text):
                continue_collecting = False
                active_key = None
            else:
                if active_key is not None:
                    extracted_paragraphs[active_key].append(text)
            return current_index

        if match_keywords(text, keywords):
            active_key = text
            extracted_paragraphs[active_key] = [text]
            if match_keywords(text, follow_up_keywords):
                continue_collecting = True
                section_number = re.match(r'^(\d+([..]\d+)*)\s*[..]?', text)  # supports both '数字、' and '数字.' numbering
                if section_number:
                    current_section_number = section_number.group(1)
                    level_count = current_section_number.count('.')

                    # Pattern to match the current level, e.g. 3.4.5
                    pattern = r'^' + (r'\d+\s*[..]\s*') * level_count + r'\d+'
                    # Generate patterns for the next section at the same level and at the parent level
                    parts = current_section_number.split('.')
                    matched_patterns = [pattern]  # start with the full pattern

                    # Next section at the same level
                    parts[-1] = str(int(parts[-1]) + 1)
                    next_pattern = r'^' + r'\s*\.\s*'.join(parts)
                    matched_patterns.append(next_pattern)

                    # Parent section (if applicable)
                    if len(parts) > 1:
                        parent_section_parts = parts[:-1]
                        parent_section_parts[-1] = str(int(parent_section_parts[-1]) + 1)
                        parent_pattern = r'^' + r'\s*\.\s*'.join(parent_section_parts)
                        matched_patterns.append(parent_pattern)

                    # Combine the patterns
                    combined_pattern = r'(' + r')|('.join(matched_patterns) + r')'
                    current_section_pattern = re.compile(combined_pattern)

                else:
                    found_next_number = False
                    current_section_pattern = None

                    # Walk the same pre-processed list the outer loop iterates, so indices stay consistent
                    while current_index < len(processed_paragraphs) - 1:
                        current_index += 1
                        next_text = processed_paragraphs[current_index].strip()
                        if not found_next_number:
                            next_section_number = re.match(r'^([A-Za-z0-9]+(?:\.[A-Za-z0-9]+)*)|(\(\d+\))', next_text)
                            if next_section_number:
                                found_next_number = True
                                if next_section_number.group(1):
                                    section_parts = next_section_number.group(1).split('.')
                                    dynamic_pattern = r'^' + r'\.'.join([r'[A-Za-z0-9]+' for _ in section_parts]) + r'\b'
                                elif next_section_number.group(2):
                                    dynamic_pattern = r'^[\(\(]\d+[\)\)]'
                                current_section_pattern = re.compile(dynamic_pattern)
                        if current_section_pattern and re.match(current_section_pattern, next_text):
                            extracted_paragraphs[active_key].append(next_text)
                        else:
                            continue_collecting = False
                            active_key = None
                            break

        return current_index

    processed_paragraphs = preprocess_paragraphs(doc.paragraphs)
    index = 0
    while index < len(processed_paragraphs):
        index = extract_from_text(processed_paragraphs[index].strip(), index)
        index += 1

    return extracted_paragraphs

def preprocess_text_list(text_list):
    new_text_list = []
    # Split at a space that follows a CJK character or punctuation mark and
    # precedes a letter, a digit, or a parenthesised number
    split_pattern = re.compile(r'(?<=[\u4e00-\u9fff。;!??!;])(?=\s+[a-zA-Z\d]|\s+\([1-9]\d*\)|\s+\([1-9]\d*\))')
    for text in text_list:
        # Split each element wherever the pattern matches
        parts = split_pattern.split(text)
        new_text_list.extend(part.strip() for part in parts if part.strip())  # drop empty fragments

    return new_text_list

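
# Illustrative input/output for the splitter above; the sentence is invented.
def _demo_preprocess_text_list():
    merged = ["投标文件逾期送达的,招标人不予受理。 (2)未按要求密封的,招标人不予受理。"]
    # The space after the first 。, followed by (2), triggers a split into two items.
    print(preprocess_text_list(merged))
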

def clean_dict_datas(extracted_contents, keywords, excludes):  # normalise whatever the regex extraction returned
    all_texts1 = []
    all_texts2 = []
    # Sentence-splitting regex covering Chinese and Western end-of-sentence punctuation
    split_pattern = r'(?<=[。!?\!\?])'

    for key, text_list in extracted_contents.items():
        if len(text_list) == 1:
            for data in text_list:
                # Skip entries containing any excluded string
                if any(exclude in data for exclude in excludes):
                    continue
                # Strip leading item numbers: letter+digit forms as well as parenthesised digits
                pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)'
                data = re.sub(pattern, '', data).strip()
                keyword_match = re.search(keywords, data)
                if keyword_match:
                    # Look for end-of-sentence punctuation starting at the keyword position
                    start_pos = keyword_match.start()
                    # Take the substring from the keyword onwards
                    substring = data[start_pos:]
                    # Split once at the first sentence-ending punctuation mark
                    sentences = re.split(split_pattern, substring, 1)
                    if len(sentences) > 0 and sentences[0]:
                        # Keep only the first sentence, punctuation included
                        cleaned_text = data[:start_pos] + sentences[0]
                        # e.g. "经采购人允许,潜在投标人可进入项目现场进行考察,但潜在投标人不得因此使采购人承担有关责任和蒙受损失。潜在投标人应自行承担现场考察的全部费用、责任和风险。"
                        # becomes "经采购人允许,潜在投标人可进入项目现场进行考察,但潜在投标人不得因此使采购人承担有关责任和蒙受损失。"
                    else:
                        cleaned_text = data  # no punctuation found: use the whole string
                else:
                    # Keyword not found: keep the original text
                    cleaned_text = data
                # Remove both ASCII and full-width spaces
                cleaned_text_no_spaces = cleaned_text.replace(' ', '').replace('\u3000', '')
                # Keep entries longer than 8 characters
                if len(cleaned_text_no_spaces) > 8:
                    all_texts1.append(cleaned_text_no_spaces)

        else:
            # print(text_list)
            new_text_list = preprocess_text_list(text_list)
            # print(new_text_list)
            pattern = r'^\s*([((]\d+[))]|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)'
            # Strip the item number from the first element only
            data = re.sub(pattern, '', new_text_list[0]).replace(' ', '').strip()
            # Reattach the cleaned first element to the remaining ones
            new_text_list[0] = data
            joined_text = "\n".join(new_text_list)
            # Remove both ASCII and full-width spaces
            joined_text_no_spaces = joined_text.replace(' ', '').replace('\u3000', '')
            all_texts2.append(joined_text_no_spaces)

    return all_texts1, all_texts2  # all_texts1 still needs an LLM pass; all_texts2 is returned directly

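
# Sketch of the two output channels of clean_dict_datas; the contents are invented.
def _demo_clean_dict_datas():
    extracted = {
        "无效投标": ["1. 投标文件未按规定签字盖章的,投标无效。"],
        "下列情形": ["出现下列情形之一的:", "(1)串通投标;", "(2)弄虚作假。"],
    }
    texts1, texts2 = clean_dict_datas(extracted, r'无\s*效', ["备注:"])
    print(texts1)  # single-sentence hits, numbering stripped
    print(texts2)  # multi-paragraph runs, joined with newlines
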
def find_sentences_with_keywords(data, keywords, follow_up_keywords):
    """Recursively collect sentences that contain the keywords, split into two lists
    depending on whether a follow-up keyword is present."""
    sentences1 = []  # cases without a follow-up keyword
    sentences2 = []  # cases with a follow-up keyword

    if isinstance(data, dict):
        for value in data.values():
            result1, result2 = find_sentences_with_keywords(value, keywords, follow_up_keywords)
            sentences1.extend(result1)
            sentences2.extend(result2)
    elif isinstance(data, list):
        for item in data:
            result1, result2 = find_sentences_with_keywords(item, keywords, follow_up_keywords)
            sentences1.extend(result1)
            sentences2.extend(result2)
    elif isinstance(data, str):
        # Split into sentences while keeping each one intact (by punctuation and item numbers)
        # split_sentences = re.split(r'(?<=[。!?\!\?])|(?=\d+[\、\.])|(?=[((]\d+[))])', data)  # older variant
        split_sentences = re.split(r'(?<=[。!?\!\?])|(?=\d+\.\d+)|(?=\d+[\、\.])|(?=[((]\d+[))])', data)
        i = 0
        while i < len(split_sentences):
            sentence = split_sentences[i].strip()
            if re.search(keywords, sentence, re.IGNORECASE):
                follow_up_present = any(
                    re.search(follow_up, sentence, re.IGNORECASE) for follow_up in follow_up_keywords)
                if follow_up_present:
                    # A follow-up keyword exists: capture from the current position onwards
                    start_index = i
                    end_index = start_index
                    found_next_section = False
                    for j in range(start_index + 1, len(split_sentences)):
                        if re.match(r'\d+\.\d+(\.\d+)?', split_sentences[j].strip()):
                            end_index = j
                            found_next_section = True
                            break

                    if found_next_section:
                        full_text = ' '.join(split_sentences[start_index:end_index]).strip()
                    else:
                        full_text = ' '.join(split_sentences[start_index:]).strip()
                    pattern = r'^\s*([((]\d+[))]|[A-Za-z]\.\s*|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)'
                    data = re.sub(pattern, '', full_text).replace(' ', '').strip()
                    sentences2.append(data)  # store the follow-up-keyword case
                    i = end_index if found_next_section else len(split_sentences)
                else:
                    pattern = r'^\s*([((]\d+[))]|[A-Za-z]\.\s*|[A-Za-z]?\d+\s*(\.\s*\d+)*(\s|\.|、|.)?|[一二三四五六七八九十]+、)'
                    cleaned_sentence = re.sub(pattern, '', sentence).replace('\n', '').replace(' ', '').strip()
                    if len(cleaned_sentence) > 8:
                        sentences1.append(cleaned_sentence)  # store the plain-keyword case
                    i += 1
            else:
                i += 1

    return sentences1, sentences2  # return both lists


def extract_sentences_from_json(json_path, keywords, follow_up_keywords):
    """Extract keyword-bearing sentences from the JSON data at json_path."""
    if not json_path:
        return [], []
    with open(json_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return find_sentences_with_keywords(data, keywords, follow_up_keywords)

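
# Sketch of the two-list split performed above; the clause text is invented.
def _demo_find_sentences_with_keywords():
    data = {"条款": "有下列情形之一的,投标无效:未按规定签字盖章。"}
    plain, with_follow_up = find_sentences_with_keywords(data, r'无\s*效', [r'下\s*列'])
    # "下列" is a follow-up keyword, so the clause lands in the second list.
    print(plain, with_follow_up)
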

# Handle invalid bids
def extract_values_if_contains(data, includes):
    """
    Recursively check whether values in the dict contain any keyword from 'includes'.
    Matching values are collected into a list and returned.

    Args:
        data (dict): a dict, or data parsed from JSON.
        includes (list): the keywords to check for.

    Returns:
        list: the values that matched.
    """
    included_values = []  # result list

    # Recursive helper that walks nested dicts and lists
    def recursive_search(current_data):
        if isinstance(current_data, dict):
            for key, value in current_data.items():
                if isinstance(value, dict):
                    # Nested dict: recurse into it
                    recursive_search(value)
                elif isinstance(value, str):
                    # String value: keep it if it contains any keyword
                    if any(include in value for include in includes):
                        included_values.append(value)
        elif isinstance(current_data, list):
            for item in current_data:
                # List: recurse into each element
                recursive_search(item)

    # Start the recursive search
    recursive_search(data)

    return included_values

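
# A tiny sketch of the recursive keyword filter; the nested dict is invented.
def _demo_extract_values_if_contains():
    data = {"a": {"b": "出现下列情形之一的,投标无效"}, "c": ["与本项目无关的说明"]}
    # Only string values containing a keyword are collected; note that list
    # values nested inside a dict are not descended into by this helper.
    assert extract_values_if_contains(data, ["无效"]) == ["出现下列情形之一的,投标无效"]
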

# Prompt drafts kept for reference:
# 你是一个文本助手,文本内的信息以'...............'分割,你负责准确筛选所需的信息并返回,每块信息要求完整,不遗漏,你不得擅自进行总结或删减。
# 以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号。
# 以上是原文内容,文本内的信息以'...............'分割,请你根据该信息回答:否决投标或拒绝投标或无效投标或使投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选所需的信息并返回。最终结果以json列表格式返回给我,键名为'否决和无效投标情形',你的回答完全忠于原文内容,且回答内容与原文内容一致,要求完整与准确,不能擅自总结或者概括。

# TODO: when truncate_json_path is empty, extract the table data separately
def handle_query(file_path, user_query, output_file, result_key, keywords, truncate_json_path):
    try:
        excludes = ["说明表", "重新招标", "否决所有", "否决投标的条件", "备注:", "本人保证:"]
        follow_up_keywords = [r'情\s*形\s*之\s*一', r'情\s*况\s*之\s*一', r'下\s*列', r'以\s*下']
        extracted_contents = extract_text_with_keywords(file_path, [keywords], follow_up_keywords)  # extract the body text (tables excluded)
        # print(extracted_contents)
        all_texts1, all_texts2 = clean_dict_datas(extracted_contents, keywords, excludes)  # lists
        all_tables1, all_tables2 = extract_sentences_from_json(truncate_json_path, keywords, follow_up_keywords)  # extract the table data (json_data)
        qianwen_txt = all_texts1 + all_tables1
        selected_contents = set()  # a set, to deduplicate

        if qianwen_txt:
            with open(output_file, 'w', encoding='utf-8') as file:
                counter = 1
                for content in qianwen_txt:
                    file.write("..............." + '\n')
                    file.write(f"{counter}. {content}\n")
                    counter += 1

            file_id = upload_file(output_file)
            qianwen_ans = qianwen_long(file_id, user_query)
            num_list = process_string_list(qianwen_ans)
            print(result_key + "选中的序号:" + str(num_list))

            for index in num_list:
                if 1 <= index <= len(qianwen_txt):
                    content = qianwen_txt[index - 1]
                    selected_contents.add(content)

        # Regardless of whether qianwen_txt was empty, include all_texts2 and all_tables2
        selected_contents.update(all_texts2)
        selected_contents.update(all_tables2)

        # Return the collected contents, or an empty string when nothing matched
        if selected_contents:
            res = {result_key: list(selected_contents)}
        else:
            res = {result_key: ""}

        return res
    except Exception as e:
        print(f"handle_query 在处理 {result_key} 时发生异常: {e}")
        return {result_key: ""}

def combine_find_invalid(invalid_docpath, output_dir, tobidders_notice_table, clause_path, qualification):
    tobidders_notice_table_docx = pdf2docx(tobidders_notice_table)  # bidder notice front table: pdf -> docx
    truncate_jsonpath = extract_tables_main(tobidders_notice_table_docx, output_dir)  # bidder notice front table: docx -> json
    queries = [
        (
            r'否\s*决|无\s*效\s*投\s*标|被\s*拒\s*绝|予\s*以\s*拒\s*绝|投\s*标\s*失\s*效|投\s*标\s*无\s*效',
            "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:否决投标或拒绝投标或无效投标或投标失效的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。",
            os.path.join(output_dir, "temp1.txt"),
            "否决和无效投标情形"
        ),
        (
            r'废\s*标',
            "以上是从招标文件中摘取的内容,文本内之间的信息以'...............'分割,请你根据该内容回答:废标项的情况有哪些?文本中可能存在无关的信息,请你准确筛选符合的信息并将它的序号返回。请以[x,x,x]格式返回给我结果,x为符合的信息的序号,若情况不存在,返回[]。",
            os.path.join(output_dir, "temp2.txt"),
            "废标项"
        )
    ]
    results = []
    # Run the queries in parallel on a thread pool
    with ThreadPoolExecutor() as executor:
        futures = []
        for keywords, user_query, output_file, result_key in queries:
            future = executor.submit(
                handle_query,
                invalid_docpath,
                user_query,
                output_file,
                result_key,
                keywords,
                truncate_jsonpath
            )
            futures.append((future, result_key))
            time.sleep(0.5)  # pause 0.5 s before submitting the next task
        # Collect the results in submission order
        for future, result_key in futures:
            try:
                result = future.result()
            except Exception as e:
                print(f"线程处理 {result_key} 时出错: {e}")
                result = {result_key: ""}
            results.append(result)

    # Forbidden-bidder (find_forbidden) part
    try:
        # print("starting不得存在的情形...")
        forbidden_res = find_forbidden(truncate_jsonpath, clause_path, qualification)
    except Exception as e:
        print(f"find_forbidden 处理时出错: {e}")
        forbidden_res = {'不得存在的其他情形': ""}
    results.append(forbidden_res)

    combined_dict = {}
    for d in results:
        combined_dict.update(d)
    # print("无效标与废标done...")
    return {"无效标与废标项": combined_dict}

if __name__ == '__main__':
    start_time = time.time()
    tobidders_notice_table = ""
    clause_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\clause1.json"
    qualification = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_qualification.pdf"
    output_dir = "C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\zbout"
    # doc_path = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\temp7\\3abb6e16-19db-42ad-9504-53bf1072dfe7\\ztbfile_invalid.docx'
    invalid_docpath = 'C:\\Users\\Administrator\\Desktop\\fsdownload\\006decc2-b3b5-4898-9b9f-4b0eab4e173f\\ztbfile_invalid.docx'
    results = combine_find_invalid(invalid_docpath, output_dir, tobidders_notice_table, clause_path, qualification)
    end_time = time.time()
    print("Elapsed time:", str(end_time - start_time))
    print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
221
flask_app/old_version/解析old_old.py
Normal file
@ -0,0 +1,221 @@
# -*- encoding:utf-8 -*-
import json
import logging
import time
from concurrent.futures import ThreadPoolExecutor
from flask_app.main.截取pdf import truncate_pdf_multiple
from flask_app.general.table_content_extraction import extract_tables_main
from flask_app.main.提取json工程标版 import convert_clause_to_json
from flask_app.general.json_utils import transform_json_values
from flask_app.main.无效标和废标和禁止投标整合 import combine_find_invalid
from flask_app.main.投标人须知正文提取指定内容工程标 import extract_from_notice
import concurrent.futures
from flask_app.main.基础信息整合快速版 import combine_basic_info
from flask_app.main.资格审查模块 import combine_review_standards
from flask_app.main.商务评分技术评分整合 import combine_evaluation_standards
from flask_app.general.format_change import pdf2docx, docx2pdf, doc2docx
from flask_app.general.docx截取docx import copy_docx


def get_global_logger(unique_id):
    if unique_id is None:
        return logging.getLogger()  # fall back to the default logger
    logger = logging.getLogger(unique_id)
    return logger


logger = None

# Global thread pool shared by the module
executor = ThreadPoolExecutor()

def preprocess_files(output_folder, downloaded_file_path, file_type, unique_id):
    logger.info("starting 文件预处理...")
    logger.info("output_folder..." + output_folder)

    # Derive the pdf/docx paths according to the input file type
    if file_type == 1:  # docx
        docx_path = downloaded_file_path
        pdf_path = docx2pdf(docx_path)  # convert docx to pdf for the later steps
    elif file_type == 2:  # pdf
        pdf_path = downloaded_file_path
        docx_path = pdf2docx(pdf_path)  # convert pdf to docx for the knowledge-base upload
    elif file_type == 3:  # doc
        pdf_path = docx2pdf(downloaded_file_path)
        docx_path = doc2docx(downloaded_file_path)
    else:
        logger.error("Unsupported file type provided. Preprocessing halted.")
        return None

    # Truncate the PDF into the required sections
    truncate_files = truncate_pdf_multiple(pdf_path, output_folder, unique_id)

    # Process the individual sections
    truncate0_docpath = pdf2docx(truncate_files[0])  # bidder notice front table: pdf -> docx

    invalid_path = truncate_files[5]
    invalid_docpath = copy_docx(docx_path)  # docx excerpt for the invalid-bid section
    # invalid_docpath = pdf2docx(invalid_path)
    truncate_jsonpath = extract_tables_main(truncate0_docpath, output_folder)  # bidder notice front table: docx -> json
    truncate0 = truncate_files[0]
    truncate1 = truncate_files[1]
    truncate2 = truncate_files[2]
    truncate3 = truncate_files[3]
    merged_baseinfo_path = truncate_files[-1]
    clause_path = convert_clause_to_json(truncate_files[2], output_folder)  # bidder notice body clauses: pdf -> json

    logger.info("文件预处理done")

    # Return a dict with the paths produced by preprocessing
    return {
        'file_path': downloaded_file_path,
        'output_folder': output_folder,
        'invalid_path': invalid_path,
        'truncate0': truncate0,
        'truncate1': truncate1,
        'truncate2': truncate2,
        'truncate3': truncate3,
        'truncate0_jsonpath': truncate_jsonpath,
        'merged_baseinfo_path': merged_baseinfo_path,
        'clause_path': clause_path,
        'invalid_docpath': invalid_docpath
    }

def post_processing(data, includes):
    # initialise the result dictionary with an empty '其他' (other) bucket
    result = {"其他": {}}

    # walk every key/value pair of the original dictionary
    for key, value in data.items():
        if key in includes:
            # keys on the include list are kept as-is
            result[key] = value
        else:
            # everything else is grouped under '其他'
            result["其他"][key] = value

    # drop the '其他' bucket if it ended up empty
    if not result["其他"]:
        del result["其他"]

    return result
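post_processing is a pure reshaping helper, so a quick worked example pins down its behaviour; the sample keys below are invented for illustration, not taken from a real tender document:

sample = {"招标人": "某单位", "项目名称": "某工程", "评标委员会": "5人"}
print(post_processing(sample, includes=["招标人", "项目名称"]))
# {'其他': {'评标委员会': '5人'}, '招标人': '某单位', '项目名称': '某工程'}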
# basic project information
def fetch_project_basic_info(merged_baseinfo_path, truncate0, truncate2, clause_path):  # bidder-notice front table
    logger.info("starting基础信息...")
    basic_res = combine_basic_info(merged_baseinfo_path, truncate0, truncate2, clause_path)
    logger.info("基础信息done")
    return basic_res
# formal, responsiveness and qualification review
def fetch_qualification_review(truncate1, truncate3, output_folder, truncate0_jsonpath, clause_path, invalid_path, merged_baseinfo_path):
    logger.info("starting资格审查...")
    review_standards_res = combine_review_standards(truncate1, truncate3, output_folder, truncate0_jsonpath,
                                                    clause_path, invalid_path, merged_baseinfo_path)
    logger.info("资格审查done")
    return review_standards_res
# scoring rules, streamed
def fetch_evaluation_standards(truncate1):  # evaluation-method front table
    logger.info("starting 商务标和技术标...")

    # get the combined result from the evaluation-method front table
    evaluation_standards_res = combine_evaluation_standards(truncate1)

    # split out the technical and commercial scores
    technical_standards = {"技术评分": evaluation_standards_res.get("技术评分", {})}
    commercial_standards = {"商务评分": evaluation_standards_res.get("商务评分", {})}

    logger.info("商务标和技术标 done")

    # wrap the two parts under new keys so they can be streamed separately
    return {
        "technical_standards": technical_standards,
        "commercial_standards": commercial_standards
    }
# invalid-bid and rejected-bid analysis
def fetch_invalid_requirements(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3):
    # rejected-bid requirements, via Qianwen
    logger.info("starting无效标与废标...")
    find_invalid_res = combine_find_invalid(invalid_docpath, output_folder, truncate0_jsonpath, clause_path, truncate3)
    logger.info("无效标与废标done...")
    return find_invalid_res
# bid-document requirements
def fetch_bidding_documents_requirements(clause_path):
    logger.info("starting投标文件要求...")
    fetch_bidding_documents_requirements_json = extract_from_notice(clause_path, 1)
    logger.info("投标文件要求done...")
    return {"投标文件要求": fetch_bidding_documents_requirements_json}


# bid opening, evaluation and award process
def fetch_bid_opening(clause_path):
    logger.info("starting开评定标流程...")
    fetch_bid_opening_json = extract_from_notice(clause_path, 2)
    logger.info("开评定标流程done...")
    return {"开评定标流程": fetch_bid_opening_json}
# streamed, section-by-section output
def engineering_bid_main(output_folder, downloaded_file_path, file_type, unique_id):
    global logger
    logger = get_global_logger(unique_id)
    # preprocess the files and collect the resulting paths
    processed_data = preprocess_files(output_folder, downloaded_file_path, file_type, unique_id)
    if not processed_data:
        yield json.dumps({})  # preprocessing failed: emit an empty JSON object
        return  # stop the generator instead of falling through with processed_data == None

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # immediately start the tasks that do not depend on knowledge_name or an index
        futures = {
            'base_info': executor.submit(fetch_project_basic_info, processed_data['merged_baseinfo_path'],
                                         processed_data['truncate0'],
                                         processed_data['truncate2'], processed_data['clause_path']),
            'qualification_review': executor.submit(fetch_qualification_review, processed_data['truncate1'],
                                                    processed_data['truncate3'], output_folder,
                                                    processed_data['truncate0_jsonpath'],
                                                    processed_data['clause_path'], processed_data['invalid_path'],
                                                    processed_data['merged_baseinfo_path']),
            'evaluation_standards': executor.submit(fetch_evaluation_standards, processed_data['truncate1']),
            'invalid_requirements': executor.submit(fetch_invalid_requirements, processed_data['invalid_docpath'],
                                                    output_folder, processed_data['truncate0_jsonpath'],
                                                    processed_data['clause_path'], processed_data['truncate3']),
            'bidding_documents_requirements': executor.submit(fetch_bidding_documents_requirements, processed_data['clause_path']),
            'opening_bid': executor.submit(fetch_bid_opening, processed_data['clause_path'])
        }

        # yield each independent task's result as soon as it completes
        for future in concurrent.futures.as_completed(futures.values()):
            key = next(k for k, v in futures.items() if v == future)  # reverse-map the future to its task name
            try:
                result = future.result()

                # evaluation_standards is split into its technical and commercial parts
                if key == 'evaluation_standards':
                    technical_standards = result["technical_standards"]
                    commercial_standards = result["commercial_standards"]

                    # emit the technical and commercial scores separately
                    yield json.dumps({'technical_standards': transform_json_values(technical_standards)}, ensure_ascii=False)
                    yield json.dumps({'commercial_standards': transform_json_values(commercial_standards)}, ensure_ascii=False)

                else:
                    # all other task results are emitted as-is
                    yield json.dumps({key: transform_json_values(result)}, ensure_ascii=False)

            except Exception as exc:
                logger.error(f"Error processing {key}: {exc}")
                yield json.dumps({'error': f'Error processing {key}: {str(exc)}'}, ensure_ascii=False)
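Because engineering_bid_main yields one JSON string per finished section, a web layer can relay it as a streamed response. As a rough sketch only (the route name and request fields here are hypothetical, not part of this commit):

from flask import Flask, Response, request

app = Flask(__name__)

@app.route('/engineering_bid', methods=['POST'])  # hypothetical endpoint
def engineering_bid():
    body = request.get_json()
    def generate():
        # relay each section as a newline-delimited JSON chunk
        for chunk in engineering_bid_main(body['output_folder'], body['file_path'],
                                          body['file_type'], body['unique_id']):
            yield chunk + '\n'
    return Response(generate(), mimetype='application/json')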
if __name__ == "__main__":
    start_time = time.time()
    output_folder = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test1"
    file_type = 2  # 1: docx, 2: pdf, 3: other
    input_file = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest2.pdf"
    print("yes")
    for output in engineering_bid_main(output_folder, input_file, file_type, "zytest"):
        print(output)
    end_time = time.time()
    elapsed_time = end_time - start_time  # total runtime
    print(f"Function execution took {elapsed_time} seconds.")
42
flask_app/old_version/资格审查模块old_old.py
Normal file
@ -0,0 +1,42 @@
import os

from flask_app.main.提取json工程标版 import convert_clause_to_json
from flask_app.general.json_utils import extract_content_from_json
from flask_app.old_version.形式响应评审old import process_reviews
from flask_app.old_version.资格评审old_old import process_qualification
from flask_app.general.通义千问long import upload_file, qianwen_long
from concurrent.futures import ThreadPoolExecutor


def combine_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath, clause_path, input_file, output_folder):  # evaluation-method front table
    # formal and responsiveness review, via Qianwen
    file_id = upload_file(truncate1)  # evaluation-method front table
    user_query_1 = "根据该文档中的评标办法前附表,请你列出该文件中的形式评审标准和响应性评审标准和资格评审标准,请以json格式返回,外层键名为'形式评审标准'和'响应性评审标准'和'资格评审标准',嵌套键名为'评审因素'中的内容,相应的键值为对应'评审标准'中的内容。"
    results = qianwen_long(file_id, user_query_1)
    original_dict_data = extract_content_from_json(results)
    qualification_review = original_dict_data.pop('资格评审标准', '默认值或None')  # qianwen-long's view of the qualification review
    with ThreadPoolExecutor() as executor:
        # create the futures
        future_qualification = executor.submit(process_qualification, qualification_review, truncate3, knowledge_name)
        future_form_response = executor.submit(process_reviews, original_dict_data, knowledge_name, truncate0_jsonpath,
                                               clause_path, input_file, output_folder)

        # wait for the results
        final_qualify_json = future_qualification.result()
        form_response_dict = future_form_response.result()
    form_response_dict.update(final_qualify_json)
    # return nest_json_under_key(form_response_dict, "资格审查")
    return {"资格审查": form_response_dict}


if __name__ == "__main__":
    input_file = "D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21\\ztbfile.pdf"
    output_folder = "D:\\flask_project\\flask_app\\static\\output\\c02a12c2-6f7b-49dc-b97f-c3d740c96c21"
    # truncate0 = os.path.join(output_folder, "zbtest20_tobidders_notice_table.pdf")
    truncate2 = os.path.join(output_folder, "ztbfile_tobidders_notice.pdf")
    knowledge_name = "zbtest20"
    truncate1 = os.path.join(output_folder, "ztbfile_evaluation_method.pdf")
    truncate3 = os.path.join(output_folder, "ztbfile_qualification.pdf")
    clause_path = convert_clause_to_json(truncate2, output_folder)
    truncate0_jsonpath = os.path.join(output_folder, "truncate_output.json")
    res = combine_review_standards(truncate1, truncate3, knowledge_name, truncate0_jsonpath, clause_path, input_file, output_folder)
    print(res)
182
flask_app/old_version/资格评审old_old.py
Normal file
@ -0,0 +1,182 @@
# -*- encoding:utf-8 -*-
# Qualification review: first set aside '联合体投标' (consortium bidding) and the forbidden situations;
# entries whose value contains '符合' (complies) and similar cross-references go into matching_keys, the rest keep their original dict entry.
import json
import re
from flask_app.general.json_utils import clean_json_string, combine_json_results, add_keys_to_json
from flask_app.general.多线程提问 import multi_threading, read_questions_from_file
from flask_app.general.通义千问long import upload_file


def merge_dictionaries_under_common_key(dicts, common_key):
    # start from an empty dict that will hold the merged result
    merged_dict = {common_key: {}}

    # walk every dictionary in the list
    for d in dicts:
        if common_key in d:
            # merge via dict unpacking / update
            merged_dict[common_key].update(d[common_key])
        else:
            print(f"资格评审: Warning: Dictionary does not contain the key {common_key}")

    return merged_dict
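A quick sanity check of merge_dictionaries_under_common_key (the sample values below are invented for illustration):

dicts = [{'资格评审': {'资质条件': '符合规定'}},
         {'资格评审': {'财务状况': '近三年无亏损'}}]
print(merge_dictionaries_under_common_key(dicts, '资格评审'))
# {'资格评审': {'资质条件': '符合规定', '财务状况': '近三年无亏损'}}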
def generate_qual_question(matching_keys_list):  # assumes 资质/信誉/人员 requirements are either all present or all absent
    if not matching_keys_list:
        return []
    else:
        questions = []
        # wrap each key in single quotes
        formatted_keys = ["'{}'".format(key) for key in matching_keys_list]
        # join the formatted keys with the Chinese enumeration comma
        keys_string = "、".join(formatted_keys)
        # build the full question
        question1 = (f"该招标文件中资格评审的内容是怎样的?具体内容包括{keys_string},"
                     "请你以json格式返回结果,外层键名为'资格评审',嵌套键名为具体的字段,请你忠于原文,回答要求完整准确,不要擅自总结、删减。")
        question2 = "该招标文件中资格评审中有关人员资格的要求是怎样的?请依次给出所需的岗位、需要的数量、资格要求、需要提交的证明材料(如具体的社保证明、技能证书等,若有时间要求请注明时间范围)、在岗要求、备注,若相关要求不存在,则无需返回该键值对。请你以json格式返回结果,外层键名为'资格评审',嵌套键名为具体的要求,请你忠于原文,回答要求完整准确,不要擅自总结、删减。"
        questions.append(question1)
        questions.append(question2)
        return questions
def extract_matching_keys_qual(dict_data):
    # patterns whose presence marks a value as a cross-reference rather than real content
    include_patterns = [re.compile(r"第.*?章"), re.compile(r"第.*?款"), re.compile(r"第.*?项"), re.compile(r"第.*?目"),
                        re.compile(r"符合")]
    # keys whose values matched one of the patterns
    matching_keys = []
    non_matching_keys = {}
    # exclusions: consortium bidding, forbidden situations and personnel are asked about separately
    excludes = ['联合体', '禁止投标', '不存在', '不得存在', '资格', '管理机构', '负责人', '人员']
    # walk every key/value pair
    for key, value in dict_data.items():
        if "附件" in key and "资质" in key:
            return [], []  # if both appear together, return empty results immediately
        # does the value match any include pattern?
        if any(pattern.search(value) for pattern in include_patterns):
            # it does; keep the key only if it hits none of the exclusions
            if not any(ex in key for ex in excludes):
                matching_keys.append(key)
        else:
            # the value already carries real content, no follow-up question needed
            non_matching_keys[key] = value

    return matching_keys, non_matching_keys  # e.g. matching: ['资质条件', '财务状况']; non_matching: {'营业执照': '具备有效的营业执照', '施工机械设备': '具备完善的施工设备'}
# fetch the consortium-bidding requirements; they are not always in the qualification table, hence the RAG call
def get_consortium_dict(knowledge_name):
    qualify_list = []
    consortium_questions = [
        "该招标文件对于联合体投标的要求是怎样的,请按json格式给我提供信息,外层键名为'联合体投标要求(如有)',嵌套键名为你对该要求的总结,而键值需要完全与原文保持一致,不要擅自总结、删减。"]
    results1 = multi_threading(consortium_questions, knowledge_name)
    for _, response in results1:  # _ is the question index; response[0] is the question, response[1] the answer
        try:
            if response and len(response) > 1:  # the response exists and has at least two elements
                qualify_list.append(response[1])
            else:
                print(f"资格评审: Warning: Missing or incomplete response data for query index {_}.")
        except Exception as e:
            print(f"资格评审: Error processing response for query index {_}: {e}")
    consortium_dict = combine_json_results(qualify_list)
    return consortium_dict
def get_all_dict(knowledge_name):
    qualification_review_file_path = 'flask_app/static/提示词/资格评审.txt'
    questions = read_questions_from_file(qualification_review_file_path)
    qualification_list = []
    res1 = multi_threading(questions, knowledge_name)
    for _, response in res1:  # _ is the question index; response[0] is the question, response[1] the answer
        try:
            if response and len(response) > 1:  # the response exists and has at least two elements
                qualification_list.append(response[1])
            else:
                print(f"资格评审: Warning: Missing or incomplete response data for query index {_}.")
        except Exception as e:
            print(f"资格评审: Error processing response for query index {_}: {e}")
    qualification_combined_res = combine_json_results(qualification_list)
    return {'资格评审': qualification_combined_res}
def process_qualification(qualification_review, truncate3, knowledge_name):
    # qualification review
    matching_keys_list, non_matching_dict = extract_matching_keys_qual(
        qualification_review)  # matching_keys_list: ['资质条件', '财务状况']; non_matching_dict: {'营业执照': '具备有效的营业执照', '施工机械设备': '具备完善的施工设备'}
    if not matching_keys_list:
        if not non_matching_dict:  # nothing extracted at all: fall back to asking the model directly
            if truncate3 != "":  # the qualification attachment was extracted
                print("资格评审: type1")
                matching_keys_list = ["资质条件", "财务要求", "业绩要求", "信誉要求", "其他要求"]
                ques = generate_qual_question(matching_keys_list)
                file_id2 = upload_file(truncate3)
                results2 = multi_threading(ques, "", file_id2, 2)  # qualification table, via qianwen-long
                res_list = []
                if not results2:
                    print("资格评审: 调用大模型未成功获取资格评审文件中的要求!")
                else:
                    # collect the answers
                    for question, response in results2:
                        res_list.append(clean_json_string(response))  # all derived from the qualification table
                if res_list:
                    merged_dict = merge_dictionaries_under_common_key(res_list, '资格评审')
                    consortium_dict = get_consortium_dict(knowledge_name)
                    updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict)
                    return updated_qualify_json
                else:
                    print("资格评审: 无法获取大模型结果,返回空值")
                    return {"资格评审": ""}
            else:
                print("资格评审: type2")
                return get_all_dict(knowledge_name) or {"资格评审": ""}

        else:  # all requirements are spelled out in the evaluation front table, no extra extraction needed
            print("资格评审: type3")
            new_non_matching_json = {'资格评审': non_matching_dict}
            substring = '联合体'
            found_key = any(substring in key for key in non_matching_dict.keys())  # if consortium bidding is absent, generate it, guarding against duplicates
            if not found_key:
                consortium_dict = get_consortium_dict(knowledge_name)
                final_qualify_json = add_keys_to_json(new_non_matching_json, consortium_dict)
                return final_qualify_json
            else:
                return new_non_matching_json or {"资格评审": ""}

    elif matching_keys_list and truncate3 == "":  # the front table has requirements, but the '资格审查表' was not truncated correctly
        print("资格评审: type4")
        final_qualification = get_all_dict(knowledge_name)
        final_qualify_json = add_keys_to_json(final_qualification, non_matching_dict)
        return final_qualify_json or {"资格评审": ""}
    else:  # the common case
        print("资格评审: type5")
        user_querys = generate_qual_question(matching_keys_list)  # build the questions -> '附件:资格审查'
        file_id2 = upload_file(truncate3)
        results2 = multi_threading(user_querys, "", file_id2, 2)  # qualification table, via qianwen-long
        res_list = []
        if not results2:
            print("资格评审: 调用大模型未成功获取资格评审文件中的要求!")
            return {"资格评审": ""}
        else:
            # collect the answers
            for question, response in results2:
                cleaned_res = clean_json_string(response)
                res_list.append(cleaned_res)  # all derived from the qualification table
            merged_dict = merge_dictionaries_under_common_key(res_list, '资格评审')
            consortium_dict = get_consortium_dict(knowledge_name)
            updated_qualify_json = add_keys_to_json(merged_dict, consortium_dict)  # merge the dicts
            final_qualify_json = add_keys_to_json(updated_qualify_json, non_matching_dict)
            return final_qualify_json or {"资格评审": ""}
if __name__ == "__main__":
    # qualification_review = {'营业执照': '具备有效的营业执照', '资质等级': '具备建设行政主管部门颁发的市政公用工程监理乙级及以上资质或房屋建筑工程监理乙级及以上资质或工程监理综合资质证书', '财务状况': '投标人须提供近三年(2018 年、2019 年、2020 年)完', '类似项目业绩': '投标人近 5 年(2017 年至今)须具有至少一项投资概算在4000 万元及以上房屋建筑工程或市政公用工程监理业绩,同时提供中标通知书及监理服务合同,项目规模及项目时间认定以监理服务合同内信息为准', '信誉': '根据《关于在招标投标活动中对失信被执行', '主要人员': '监理工程师:至少提供 1 名具有房屋建筑工程专', '不存在禁止投标的': '不存在第二章“投标人须知”第 1.4.3 项规定的情形', '联合体投标': 'hha'}
    qualification_review = {'营业执照': '具备有效的营业执照', '安全生产许可证': '具备有效的安全生产许可证',
                            '资质等级': '符合第二章“投标人须知”规定', '财务状况': '符合第二章“投标人须知”规定'}
    truncate3 = "C:\\Users\\Administrator\\Desktop\\招标文件\\new_test\\zbtest2_qualification.pdf"
    knowledge_name = "招标解析word13"
    res = process_qualification(qualification_review, truncate3, knowledge_name)
    print(json.dumps(res, ensure_ascii=False, indent=4))

# 该招标文件中资格评审关于财务状况的内容是怎样的?请你以json格式返回结果,外层键名为'财务状况',请你忠于原文,回答要求完整准确,不要擅自总结、删减,且不要回答诸如'见投标人须知前附表'或'见第x.x项规定'这类无实质性内容的回答。
29
flask_app/old_version/资格评审前判断_old.py
Normal file
@ -0,0 +1,29 @@
from flask_app.general.读取文件.按页读取pdf import extract_text_by_page


def check_strings_in_pdf(file_path, judge_list=('施工机械设备', '企业信息登记')):
    # judge_list: strings whose presence in the PDF triggers a question
    # read text from the PDF
    text = extract_text_by_page(file_path)  # assuming this returns all text from the PDF
    full_text = ''.join(text).replace('\n', '').replace(' ', '')  # clean up the text

    # initialize the questions list
    ques_list = []

    # check each string in judge_list and construct the matching question
    if judge_list[0] in full_text:
        ques_list.append(f"该招标文件对于'{judge_list[0]}'的要求是怎样的,请按json格式给我提供信息,键名为'{judge_list[0]}',若存在未知信息,在对应的键值中填'未知'。")
    if len(judge_list) > 1 and judge_list[1] in full_text:
        ques_list.append(f"该招标文件对于'{judge_list[1]}'的要求是怎样的,请按json格式给我提供信息,键名为'{judge_list[1]}',若存在未知信息,在对应的键值中填'未知'。")

    if not ques_list:
        return None
    return ques_list


# example usage
if __name__ == '__main__':
    file_path = 'C:/Users/Administrator/Desktop/zbtest18_39-45.pdf'  # replace with your actual PDF file path
    judge_list = ['施工机械设备', '企业信息登记']  # list of strings to check in the PDF

    questions = check_strings_in_pdf(file_path, judge_list)
    for question in questions or []:  # guard against the None return when nothing matched
        print(question)
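The questions this helper emits are shaped for the same multi_threading helper used elsewhere in this commit; a hedged sketch of wiring the two together (the knowledge_name value here is a placeholder, not from this codebase):

from flask_app.general.多线程提问 import multi_threading

questions = check_strings_in_pdf('zbtest18_39-45.pdf')
if questions:
    # each result pairs the question index with (question, answer)
    for _, response in multi_threading(questions, "zbtest18"):
        if response and len(response) > 1:
            print(response[1])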
48
flask_app/old_version/通义千问_old.py
Normal file
@ -0,0 +1,48 @@
import json
import random
from http import HTTPStatus
from dashscope import Generation


def call_with_messages(messages):
    response = Generation.call(model="qwen-max",
                               messages=messages,
                               seed=random.randint(1, 10000),
                               temperature=0.5,
                               top_p=0.5,
                               top_k=50,
                               result_format='message')
    if response.status_code == HTTPStatus.OK:
        content = response.output['choices'][0]['message']['content']
        return content
    else:
        raise Exception(f'Request id: {response.request_id}, Status code: {response.status_code}, error code: {response.code}, error message: {response.message}')


def prepare_question_from_json(json_path, user_query):
    with open(json_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)
    question = json.dumps(json_data, ensure_ascii=False) + user_query
    return question


# dedicated to yes/no judgments over a JSON context
def qianwen_ask(json_path, user_query):
    messages = []
    question = prepare_question_from_json(json_path, user_query)
    messages.append({'role': 'system', 'content': 'You are a helpful assistant.'})
    messages.append({'role': 'user', 'content': question})
    return call_with_messages(messages)


# general-purpose questions
def qianwen_ask2(questions):
    messages = []
    messages.append({'role': 'system', 'content': 'You are a helpful assistant.'})
    messages.append({'role': 'user', 'content': questions})
    return call_with_messages(messages)


if __name__ == '__main__':
    json_path = 'judge_exist.json'
    prompt = "请你依据以上信息回答,是否组织踏勘现场?是否召开投标预备会?是否允许偏离?是否退还投标文件?是否允许分包? 是否需要递交投标保证金?是否有履约保证金(履约担保)?是否有招标代理服务费?请按json格式给我提供信息,键名分别为'是否组织踏勘现场','是否召开投标预备会','是否允许偏离','是否退还投标文件','是否允许分包','是否需要递交投标保证金','是否有履约保证金','是否有招标代理服务费',键值仅限于'是','否','未知',若存在未知或者矛盾信息,请回答'未知'。"
    try:
        content = qianwen_ask(json_path, prompt)
        print(content)
    except Exception as e:
        print(f"An error occurred: {e}")
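qianwen_ask2 takes a plain question string rather than a JSON context; a minimal usage sketch (the question text is only an example):

try:
    answer = qianwen_ask2("该招标文件的项目名称是什么?请用一句话回答。")
    print(answer)
except Exception as e:
    print(f"An error occurred: {e}")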