11.20 修改bug

2024-11-20 19:35:22 +08:00 · 2024-11-20 19:35:22 +08:00 · b97ae7c034
commit b97ae7c034
parent 6dd1e02290
6 changed files with 102 additions and 83 deletions
--- a/flask_app/general/post_processing.py
+++ b/flask_app/general/post_processing.py
@ -106,7 +106,7 @@ def extract_business_requirements(data):
    model_res1 = doubao_model(user_query1)
    # print(model_res)
    business_req_deviation = clean_json_string(model_res1)
-    prompt_template2 = """以下文本是项目采购需求的商务要求部分，请你帮我从键值列表中各字符串中提取带星★或带三角▲的要求项，你的返回格式同输入文本格式，外键名为'商务要求（带星）'，键值为字符串列表，其中每个字符串为带星★或带三角▲的要求项。
+    prompt_template2 = """以下文本是项目采购需求的商务要求部分，请你帮我从键值列表中各字符串中提取带星★或带三角▲的要求项，你的返回格式同输入文本格式，外键名为'商务要求带星'，键值为字符串列表，其中每个字符串为带星★或带三角▲的要求项。
    要求与指南：
    1. 每个星★或三角▲要求占据一个字符串。
    2. 若没有带星★或带三角▲的要求项，键值为空列表，即[]
@ -121,7 +121,7 @@ def extract_business_requirements(data):
    }}
    ### 对应的输出如下：
    {{
-        "商务要求（带星）": [
+        "商务要求带星": [
            "★交货期（工期）：合同签订之日起 15个日历天内完成，并通过项目验收。",
            "▲本项目报价须为固定总价，包含但不限于：采购、实施、调试、试运行、验收、运维等所有完成本项目相关的一切费用。"
        ]
--- a/flask_app/general/多线程提问.py
+++ b/flask_app/general/多线程提问.py
@ -6,10 +6,12 @@ import re
 import queue
 import concurrent.futures
 import time
 from datetime import datetime
 import requests
 from dashscope import Assistants, Messages, Runs, Threads
 from llama_index.indices.managed.dashscope import DashScopeCloudRetriever
-from flask_app.general.通义千问long import qianwen_long
+from flask_app.general.通义千问long import qianwen_long, upload_file
 prompt = """
 # 角色
@ -241,6 +243,10 @@ def llm_call(question, knowledge_name,file_id, result_queue, ans_index, llm_type
        # assistant=create_assistant(knowledge_name)
    elif llm_type==2:
        print(f"qianwen_long! question:{question}")
        # 获取当前时间
        current_time = datetime.now()
        # 输出时分秒
        print(current_time.strftime("%H:%M:%S.%f")[:-3])
        # qianwen_res,usage = qianwen_long(file_id,question)   #有bug
        qianwen_res = qianwen_long(file_id, question)
        result_queue.put((ans_index,(question,qianwen_res)))
@ -257,68 +263,72 @@ def multi_threading(queries, knowledge_name="", file_id="", llm_type=1):
    print("多线程提问：starting multi_threading...")
    result_queue = queue.Queue()
    max_retries = 2  # 设置最大重试次数
-    # 使用 ThreadPoolExecutor 管理线程
+    retry_counts = {}  # 跟踪每个查询的重试次数
    with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor:
        # 逐个提交任务，每提交一个任务后休眠0.5秒
        future_to_query = {}
        for index, query in enumerate(queries):
            time.sleep(0.5)  # 每提交一个任务后等待0.5秒
            future = executor.submit(llm_call, query, knowledge_name, file_id, result_queue, index, llm_type)
            future_to_query[future] = index
-            time.sleep(0.5)  # 每提交一个任务后等待1秒
+            retry_counts[index] = 0  # 初始化重试次数
-        # 收集每个线程的结果
+        while future_to_query:
-        for future in concurrent.futures.as_completed(future_to_query):
+            done, _ = concurrent.futures.wait(
                future_to_query.keys(),
                return_when=concurrent.futures.FIRST_COMPLETED
            )
            for future in done:
                index = future_to_query[future]
-            retries = 0
+                del future_to_query[future]
                try:
                    future.result()  # 捕获异常或确认任务完成
                except Exception as exc:
-                # print(f"Query {index} generated an exception: {exc}")
+                    print(f"Query {index} generated an exception: {exc}")
-                retries += 1  # 增加重试计数
+                    retry_counts[index] += 1  # 增加重试计数
-                # 确保在异常情况下也向 result_queue 添加占位符
+                    if retry_counts[index] <= max_retries:
-                result_queue.put((index, None))
+                        print(f"Retrying query {index} (attempt {retry_counts[index]})...")
-                if retries < max_retries:
-                    print(f"Retrying query {index} (attempt {retries + 1})...")
                        print("重试的问题：" + queries[index])
                        # 重新提交任务
-                    future = executor.submit(llm_call, queries[index], knowledge_name, file_id, result_queue, index, llm_type)      #可能遇到阿里服务器挂壁的情况，重试一下
+                        new_future = executor.submit(llm_call, queries[index], knowledge_name, file_id, result_queue, index, llm_type)
-                    future_to_query[future] = index
+                        future_to_query[new_future] = index
                    else:
                        print(f"Query {index} failed after {max_retries} attempts.")
-                    break  # 超过最大重试次数，退出循环
+                        result_queue.put((index, None))  # 添加占位符
    # 从队列中获取所有结果并按索引排序
    results = [None] * len(queries)
    while not result_queue.empty():
        index, result = result_queue.get()
        results[index] = result
    # 检查是否所有结果都是 None
    if all(result is None for result in results):
        return []
    # 过滤掉None值
    results = [r for r in results if r is not None]
    # 返回一个保证是列表的结构
    return results
 if __name__ == "__main__":
    start_time=time.time()
-    # # 读取问题列表
+    # # # 读取问题列表
-    baseinfo_file_path = '/flask_app/static/提示词/基本信息工程标.txt'
+    # baseinfo_file_path = '/flask_app/static/提示词/基本信息工程标.txt'
-    questions =read_questions_from_file(baseinfo_file_path)
+    # questions =read_questions_from_file(baseinfo_file_path)
-    knowledge_name = "招标解析5word"
+    # knowledge_name = "招标解析5word"
-    llm_type=1
+    # llm_type=1
-    results = multi_threading(questions, knowledge_name)
+    # results = multi_threading(questions, knowledge_name)
-    end_time = time.time()
+    # end_time = time.time()
-    if not results:
+    # if not results:
-        print("errror!")
+    #     print("errror!")
-    else:
+    # else:
-        print("elapsed time:"+str(end_time-start_time))
+    #     print("elapsed time:"+str(end_time-start_time))
-        # 打印结果
+    #     # 打印结果
-        for question, response in results:
+    #     for question, response in results:
-            print(f"Response: {response}")
+    #         print(f"Response: {response}")
-    # file_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\6.2定版视频会议磋商文件(1)\\6.2定版视频会议磋商文件_1-21.pdf"
+    file_path = r"C:\Users\Administrator\Desktop\fsdownload\39b0c3b4-1807-456c-8330-c5c7d1b7a2ca\ztbfile_procurement\ztbfile_procurement_1.pdf"
-    # file_id = upload_file(file_path)
+    file_id = upload_file(file_path)
    # questions=["该招标文件的项目名称是？项目编号（或招标编号）是？采购人（或招标人）是？采购代理机构（或招标代理机构）是？请按json格式给我提供信息，键名分别是'项目名称','项目编号','采购人','采购代理机构'，若存在未知信息，在对应的键值中填'未知'。","该招标文件的项目概况是？项目基本情况是？请按json格式给我提供信息，键名分别为'项目概况','项目基本情况'，若存在嵌套信息，嵌套内容键名以文件中对应字段命名，而嵌套键值必须与原文保持一致，若存在未知信息，在对应的键值中填'未知'。"]
    # results=multi_threading(questions,"",file_id,2)  #1代表使用百炼rag 2代表使用qianwen-long
    # if not results:
@ -339,3 +349,9 @@ if __name__ == "__main__":
    #     for question, response in results:
    #         print(f"Question: {question}")
    #         print(f"Response: {response}")
    query=[]
    for i in range(1,50):
        query.append("请返回这个数字："+str(i))
    res=multi_threading(query,"",file_id,2)
    for _,response in res:
        print(response)
--- a/flask_app/general/纯技术参数要求提取.py
+++ b/flask_app/general/纯技术参数要求提取.py
@ -47,7 +47,7 @@ def get_technical_requirements_main(file_path,file_type,unique_id,output_folder)
    else:
        return final_res
 if __name__ == "__main__":
-    file_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件（广水市教育局封闭管理）.pdf"
+    file_path=r"C:\Users\Administrator\Desktop\fsdownload\39b0c3b4-1807-456c-8330-c5c7d1b7a2ca\ztbfile.pdf"
    file_type=2
    output_folder = r"C:\Users\Administrator\Desktop\fsdownload\39b0c3b4-1807-456c-8330-c5c7d1b7a2ca\tmp"
    res=get_technical_requirements_main(file_path,file_type,"123",output_folder)
--- a/flask_app/货物标/技术参数要求提取.py
+++ b/flask_app/货物标/技术参数要求提取.py
@ -5,6 +5,7 @@ import re
 import time
 from flask_app.general.file2markdown import convert_pdf_to_markdown
 from flask_app.general.format_change import pdf2docx
 from flask_app.general.多线程提问 import multi_threading
 from flask_app.general.通义千问long import qianwen_long, upload_file
 from flask_app.general.json_utils import clean_json_string, combine_json_results
@ -209,10 +210,11 @@ def combine_and_update_results(original_data, updates):
 #文件内容以markdown格式组织，其中表格部分（若有）以html语法组织，
 def get_technical_requirements(file_path,invalid_path):
-    file_id=upload_file(file_path)
+    docx_file_path=pdf2docx(file_path)
-    first_query_template="该文件是否说明了采购需求,即需要采购哪些货物?如果有,请回答'是',否则,回答'否'"
+    print(docx_file_path)
-    # first_query=generate_full_user_query(file_path,first_query_template)
+    file_id=upload_file(docx_file_path)
-    # judge_res=doubao_model(first_query)
+    # file_id='file-fe-v6T6MGCW83b0m5uxHyP8IAoh'
    first_query_template="该文件是否说明了采购需求,即需要采购哪些货物?如果有,请回答'是',否则,回答'否'"   #防止截取失败
    judge_res=qianwen_long(file_id,first_query_template)
    prompt_template1 = '''
                    任务：解析采购文件，提取采购需求，并以JSON格式返回。
@ -230,26 +232,25 @@ def get_technical_requirements(file_path,invalid_path):
                        1.JSON格式，最外层键名为'采购需求'。
                        2.层次关系用嵌套键值对表示。
                        3.嵌套键名为系统或货物或模块名称，与原文保持一致。
-                        4.最内层键值应为空对象（{{}}）。
+                        4.最内层键值应为空列表[]。
                        5.不包含'说明'、'规格'、'技术参数'等列内容，仅返回采购的货物或系统或模块名称。
                        特殊情况处理：
                        同一层级（如同一系统中）下同名但采购要求不同的货物，以'货物名-编号'区分，编号从1递增。
                        示例输出1，普通系统、货物类采购：
                        {{
                            "采购需求": {{
-                                "交换机-1": {{}},
+                                "交换机-1": [],
-                                "交换机-2": {{}},
+                                "交换机-2": [],
                                "门禁管理系统": {{
-                                    "系统功能":{{}}
+                                    "系统功能":[]
                                }},
                                "交通监控视频子系统": {{
-                                    "系统功能": {{}},
+                                    "系统功能": [],
-                                    "高清视频抓拍像机": {{}},
+                                    "高清视频抓拍像机": [],
-                                    "补光灯": {{}}
+                                    "补光灯": []
                                }},
-                                "LED全彩显示屏": {{}}
+                                "LED全彩显示屏": []
                                // 其他系统和货物
                            }}
                        }}
@ -257,15 +258,15 @@ def get_technical_requirements(file_path,invalid_path):
                        {{
                            "采购需求": {{
                                "信息管理系统": {{
-                                    "通用模块":{{}},
+                                    "通用模块":[],
-                                    "用户管理":{{}}
+                                    "用户管理":[]
                                }},
                                "信息检索系统": {{
-                                    "系统功能":{{}},
+                                    "系统功能":[],
-                                    "权限管理模块":{{}}
+                                    "权限管理模块":[]
                                }},
-                                "XX小程序":{{}},
+                                "XX小程序":[],
-                                "数据分析中心":{{}}
+                                "数据分析中心":[]
                            }}
                        }}
@ -289,7 +290,7 @@ def get_technical_requirements(file_path,invalid_path):
                        1.JSON格式，最外层键名为'采购需求'。
                        2.层次关系用嵌套键值对表示。
                        3.嵌套键名为系统或货物或模块名称，与原文保持一致。
-                        4.最内层键值应为空对象（{{}}）。
+                        4.最内层键值应为空列表[]。
                        5.不包含'说明'、'规格'、'技术参数'等列内容，仅返回采购的货物或系统或模块名称。
                        特殊情况处理：
@ -298,17 +299,17 @@ def get_technical_requirements(file_path,invalid_path):
                        示例输出1，普通系统、货物类采购：
                        {{
                            "采购需求": {{
-                                "交换机-1": {{}},
+                                "交换机-1": [],
-                                "交换机-2": {{}},
+                                "交换机-2": [],
                                "门禁管理系统": {{
-                                    "系统功能":{{}}
+                                    "系统功能":[]
                                }},
                                "交通监控视频子系统": {{
-                                    "系统功能": {{}},
+                                    "系统功能": [],
-                                    "高清视频抓拍像机": {{}},
+                                    "高清视频抓拍像机": [],
-                                    "补光灯": {{}}
+                                    "补光灯": []
                                }},
-                                "LED全彩显示屏": {{}}
+                                "LED全彩显示屏": []
                                // 其他系统和货物
                            }}
                        }}
@ -316,15 +317,15 @@ def get_technical_requirements(file_path,invalid_path):
                        {{
                            "采购需求": {{
                                "信息管理系统": {{
-                                    "通用模块":{{}},
+                                    "通用模块":[],
-                                    "用户管理":{{}}
+                                    "用户管理":[]
                                }},
                                "信息检索系统": {{
-                                    "系统功能":{{}},
+                                    "系统功能":[],
-                                    "权限管理模块":{{}}
+                                    "权限管理模块":[]
                                }},
-                                "XX小程序":{{}},
+                                "XX小程序":[],
-                                "数据分析中心":{{}}
+                                "数据分析中心":[]
                            }}
                        }}
@ -432,7 +433,7 @@ def get_technical_requirements(file_path,invalid_path):
            # 打印结果
            for question, response in results:
                technical_requirements.append(response)
-                # print(response)
+                print(response)
        technical_requirements_combined_res = combine_json_results(technical_requirements)
        """根据所有键是否已添加处理技术要求"""
--- a/flask_app/货物标/货物标解析main.py
+++ b/flask_app/货物标/货物标解析main.py
@ -240,6 +240,8 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
 #TODO:把所有未知都删掉。
 #TODO:考虑把解析失败的调用豆包，全文上传。
 #TODO:写个脚本确保技术参数没有嵌套
 #TODO:C:\Users\Administrator\Desktop\fsdownload\16fd6b4e-3975-4c83-8ba6-1bc9263a6a5b 符合性审查未找到
 #商务标这里改为列表最里层
 #good_list 金额  截取上下文
 if __name__ == "__main__":
--- a/flask_app/货物标/资格审查main.py
+++ b/flask_app/货物标/资格审查main.py
@ -655,14 +655,14 @@ if __name__ == "__main__":
    output_folder=r"D:\flask_project\flask_app\static\output\output1\c911b0f8-0ff4-4718-80e3-86f464f313d3"
    # qualification_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\52e54b20-c975-4cf3-a06b-6f146aaa93f5\\ztbfile_qualification1.pdf"
    # qualification_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\6558a50a-13ea-4279-a5db-684935481c39\\ztbfile_qualification2.pdf"
-    qualification_path=r"D:\flask_project\flask_app\static\output\output1\c911b0f8-0ff4-4718-80e3-86f464f313d3\ztbfile_qualification.pdf"
+    qualification_path=r"C:\Users\Administrator\Desktop\fsdownload\16fd6b4e-3975-4c83-8ba6-1bc9263a6a5b\ztbfile_qualification1.pdf"
    # notice_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\6558a50a-13ea-4279-a5db-684935481c39\\ztbfile_notice.pdf"
    # notice_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\52e54b20-c975-4cf3-a06b-6f146aaa93f5\\ztbfile_notice.pdf"
-    notice_path=r"D:\flask_project\flask_app\static\output\output1\c911b0f8-0ff4-4718-80e3-86f464f313d3\ztbfile_notice.pdf"
+    notice_path=r"C:\Users\Administrator\Desktop\fsdownload\16fd6b4e-3975-4c83-8ba6-1bc9263a6a5b\ztbfile_notice.pdf"
    # knowledge_name = "6.2视频会议docx"
    # invalid_path = "D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile.pdf"
    # invalid_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\52e54b20-c975-4cf3-a06b-6f146aaa93f5\\ztbfile.pdf"
-    invalid_path=r"D:\flask_project\flask_app\static\output\output1\c911b0f8-0ff4-4718-80e3-86f464f313d3\ztbfile_invalid.pdf"
+    invalid_path=r"C:\Users\Administrator\Desktop\fsdownload\16fd6b4e-3975-4c83-8ba6-1bc9263a6a5b\ztbfile_invalid.pdf"
    res = combine_qualification_review(invalid_path, qualification_path, notice_path)
    print(json.dumps(res, ensure_ascii=False, indent=4))
    end_time=time.time()