Merge branch 'feature/fea-1' into 12-6-test

# Conflicts: # flask_app/general/doubao.py
2024-12-06 14:45:38 +08:00 · 2024-12-06 14:45:38 +08:00 · 94a710378a
commit 94a710378a
parent d2090e7fd6 d15c094578
4 changed files with 51 additions and 53 deletions
--- a/flask_app/general/file2markdown.py
+++ b/flask_app/general/file2markdown.py
@ -50,12 +50,12 @@ def convert_pdf_to_markdown(file_path):
    resp = textin.recognize_pdf2md(image, {
        'page_start': 0,
        'page_count': 50,  # 设置解析页数为50页
-        'table_flavor': 'md',  # html 按html语法输出表格
-        'parse_mode': 'scan',  # 设置解析模式为scan模式
+        'table_flavor': 'html',  # html 按html语法输出表格
+        'parse_mode': 'auto',  # parse mode set to auto
        'page_details': 0,  # 不包含页面细节
        'markdown_details': 1,
        'apply_document_tree': 1,
-        'dpi': 144  # 分辨率设置为144 dpi
+        'dpi': 216  # resolution set to 216 dpi (the default is 144 dpi)
    })
    print("request time: ", resp.elapsed.total_seconds())
    data = json.loads(resp.text)
--- a/flask_app/货物标/商务服务其他要求提取.py
+++ b/flask_app/货物标/商务服务其他要求提取.py
@ -131,11 +131,11 @@ def generate_queries(truncate_file, required_keys):
            query_base += "也不需要回答\"{}\"中的内容，".format("\"和\"".join(other_keys))
        query_base += "若相关要求不存在，在键值中填'未知'。"
        queries.append(query_base)
-        # print(query_base)
    return queries


-def generate_template(required_keys, type=1):
+def generate_template(required_keys,processed_filepath, type=1):
+    full_text=read_txt_to_string(processed_filepath)
    # 定义每个键对应的示例内容
    example_content1 = {
        "技术要求": ["相关技术要求1", "相关技术要求2"],
@ -249,27 +249,26 @@ def generate_template(required_keys, type=1):
    {tech_json_example1_str}
    示例 2，嵌套键值对形式：
    {tech_json_example2_str}
+    
+    文件内容：{full_text}
    """
    return user_query_template

-def get_business_requirements(procurement_path,procurement_docpath):
-    file_id = upload_file(procurement_docpath)
-    print(file_id)
+def get_business_requirements(procurement_path,processed_filepath):
    required_keys = ["技\s*术\s*要\s*求", "商\s*务\s*要\s*求", "服\s*务\s*要\s*求", "其\s*他\s*要\s*求","总\s*体\s*要\s*求","建\s*设\s*要\s*求","进\s*度\s*要\s*求","工\s*期\s*要\s*求","质\s*保\s*要\s*求","培\s*训\s*要\s*求","售\s*后\s*要\s*求"]
    contained_keys = find_exists(procurement_path, required_keys)
    print(contained_keys)
    if not contained_keys:
        return {}
-    # queries = generate_queries(truncate_file, contained_keys)
-    busi_user_query = generate_template(contained_keys, 1)
-    tech_user_query = generate_template(contained_keys, 2)
+    busi_user_query = generate_template(contained_keys, processed_filepath,1)
+    tech_user_query = generate_template(contained_keys, processed_filepath,2)
    final_res={}
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        futures = []
        if busi_user_query:
-            futures.append(executor.submit(qianwen_long_stream, file_id, busi_user_query, 2, 1))
+            futures.append(executor.submit(doubao_model,busi_user_query))
        if tech_user_query:
-            futures.append(executor.submit(qianwen_long_stream, file_id, tech_user_query, 2, 1))
+            futures.append(executor.submit(doubao_model,tech_user_query))
        # 获取结果
        for future in concurrent.futures.as_completed(futures):
            try:
--- a/flask_app/货物标/技术参数要求提取.py
+++ b/flask_app/货物标/技术参数要求提取.py
@ -353,29 +353,31 @@ def generate_prompt(judge_res, full_text=None):
    base_prompt += "\n注意事项：\n1.严格按照上述要求执行，确保输出准确性和规范性。\n"
    return base_prompt

-#文件内容以markdown格式组织，其中表格部分（若有）以html语法组织，
-def get_technical_requirements(file_path,invalid_path,processed_filepath):
-    # docx_file_path=pdf2docx(file_path)
-    file_id=upload_file(file_path) #目前传入的为docx文档
-    first_query_template="该文件是否说明了采购需求,即需要采购哪些货物?如果有,请回答'是',否则,回答'否'"   #防止截取失败
-    judge_res=qianwen_long(file_id,first_query_template)
+def get_technical_requirements(invalid_path,processed_filepath):
+    file_id=""
+    model_type=1   #默认使用豆包
+    # 防止截取失败
+    first_query_template="""该文件是否说明了采购需求,即需要采购哪些货物?如果有,请回答'是',否则,回答'否'  
+       文件内容：{full_text}
+       """
+    judge_query=generate_full_user_query(processed_filepath,first_query_template)
+    judge_res=doubao_model(judge_query)
    if '否' in judge_res:
+        model_type=0    #使用qianwen-long+invalid_path
        print("no!调用invalid_path")
        file_id=upload_file(invalid_path)
        user_query = generate_prompt(judge_res)
        model_res=qianwen_long(file_id,user_query)
        print(model_res)
    else:
-        # processed_filepath = convert_pdf_to_markdown(file_path)   # 转markdown格式
-        # processed_filepath=r"C:\Users\Administrator\Desktop\货物标\extract_files\107国道.txt"
        full_text = read_txt_to_string(processed_filepath)
        user_query=generate_prompt(judge_res,full_text)
        model_res=doubao_model(user_query)
        print(model_res)
    cleaned_res = clean_json_string(model_res)     #转字典
-    processed_data=truncate_system_keys(cleaned_res['采购需求'])
+    processed_data=truncate_system_keys(cleaned_res['采购需求'])  #防止嵌套层级过长
    key_paths, grouped_paths, good_list, data_copy= generate_key_paths(processed_data)  # 提取需要采购的货物清单 key_list：交通监控视频子系统.高清视频抓拍像机 ... grouped_paths是同一系统下同时有'交换机-1'和'交换机-2'，提取'交换机'  ，输出eg:{'交通标志.标志牌铝板', '交通信号灯.交换机'}
-    modified_data=rename_keys(data_copy)
+    modified_data=rename_keys(data_copy)   #
    user_query_template = """请根据货物标中采购要求部分的内容，告诉我\"{}\"的技术参数或采购要求是什么。请以 JSON 格式返回结果，键名为\"{}\"，键值为一个列表，列表中包含若干描述\"{}\"的技术参数或采购要求的字符串，请按原文内容回答，保留三角▲、五角★和序号，不可擅自增删内容，尤其是不可擅自添加序号。

 要求与指南：
@ -401,6 +403,8 @@ def get_technical_requirements(file_path,invalid_path,processed_filepath):
        "协议：routes 接口开放：具备；▲支持标准 ONVIF 协议与第三方厂家设备进行互联；支持 GB/T28181；应提供 SDK"
    ]
 }}
+
+文件内容:{}
 """
    user_query_template_two="""请根据货物标中采购要求部分的内容，告诉我\"{}\"的技术参数或采购要求是什么。由于该货物存在 {} 种不同的采购要求或技术参数，请逐一列出，并以 JSON 格式返回结果。请以'货物名-编号'区分多种型号，编号为从 1 开始的自然数，依次递增，即第一个键名为\"{}-1\"；键值为一个列表，列表中包含若干描述\"{}\"的技术参数（或采购要求）的字符串，请按原文内容回答，保留三角▲、五角★和序号（若有），不可擅自增删内容，尤其是不可擅自添加序号。

@ -434,31 +438,35 @@ def get_technical_requirements(file_path,invalid_path,processed_filepath):
        "支持夜视", "支持云存储"
    ]
 }}
+
+文件内容：{}
        """
    queries = []
    for key in key_paths:
        # 将键中的 '.' 替换为 '下的'
        modified_key = key.replace('.', '下的')
        # 使用修改后的键填充第一个占位符，原始键填充第二个占位符
-        # full_text = read_txt_to_string(processed_filepath)
-        # new_query = user_query_template.format(modified_key, key, modified_key,full_text)   #转豆包后取消注释
-        new_query = user_query_template.format(modified_key, key, modified_key)
+        if model_type:
+            full_text = read_txt_to_string(processed_filepath)
+            new_query = user_query_template.format(modified_key, key, modified_key,full_text)   #Doubao: the full file text fills the template's last placeholder
+        else:
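+            new_query = user_query_template.format(modified_key, key, modified_key)  # NOTE(review): user_query_template now ends with an extra "{}" for the file content, so formatting with only 3 arguments raises IndexError on this qianwen path - confirm and fix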
        queries.append(new_query)
-
        # 处理 grouped_paths 中的项，应用 user_query_template_two
    for grouped_dict in grouped_paths:
        for grouped_key, grouped_key_cnt in grouped_dict.items():
            # 将键中的 '.' 替换为 '下的'
            modified_grouped_key = grouped_key.replace('.', '下的')
-            # 使用修改后的键填充第一个占位符，原始键填充第二个占位符
-            # 如果需要使用 full_text，可以取消注释并提供相应的实现
-            # full_text = read_txt_to_string(processed_filepath)
-            # new_query = user_query_template_two.format(modified_grouped_key, grouped_key, modified_grouped_key, full_text)
-            # 根据您的需求，生成新的查询字符串
-            new_query = user_query_template_two.format(modified_grouped_key, grouped_key_cnt,grouped_key, modified_grouped_key)
+            if model_type:
+                full_text = read_txt_to_string(processed_filepath)
+                new_query = user_query_template_two.format(modified_grouped_key, grouped_key_cnt,grouped_key, modified_grouped_key, full_text)
+            else:
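+                new_query = user_query_template_two.format(modified_grouped_key, grouped_key_cnt, grouped_key,modified_grouped_key)  # NOTE(review): user_query_template_two now ends with an extra "{}" for the file content, so formatting with only 4 arguments raises IndexError on this qianwen path - confirm and fix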
            queries.append(new_query)
-    results = multi_threading(queries, "", file_id, 2)  #通义
-    # results = multi_threading(queries, "", "", 3)   #豆包
+    if model_type:
+        results = multi_threading(queries, "", "", 3)   #豆包
+    else:
+        results = multi_threading(queries, "", file_id, 2)  # Tongyi (qianwen-long), queried with the uploaded file_id
    technical_requirements = []
    if not results:
        print("errror!未获得大模型的回答！")
@ -505,15 +513,12 @@ if __name__ == "__main__":
    # truncate_file="C:\\Users\\Administrator\\Desktop\\fsdownload\\469d2aee-9024-4993-896e-2ac7322d41b7\\ztbfile_procurement.docx"
    truncate_docfile=r"C:\Users\Administrator\Desktop\货物标\output1\6_2定版视频会议磋商文件_procurement.docx"
    truncate_file=r'C:\Users\Administrator\Desktop\货物标\output1\6.2定版视频会议磋商文件_procurement.pdf'
-    # invalid_path="D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile.pdf"
-    # truncate_file="D:\\flask_project\\flask_app\\static\\output\\output1\\e7dda5cb-10ba-47a8-b989-d2993d34bb89\\ztbfile_procurement.docx"
-    # output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\tmp"
-    # file_id = upload_file(truncate_file)
    invalid_path=r"C:\Users\Administrator\Desktop\fsdownload\a110ed59-00e8-47ec-873a-bd4579a6e628\ztbfile.pdf"
    # file_id=upload_file(truncate_file)
-    processed_filepath = pdf2txt(truncate_file)
-    # processed_filepath=r"C:\Users\Administrator\Desktop\fsdownload\e702f1e6-095d-443d-bb7d-ef2e42037cb1\金水河沿线排涝泵站提档升级项目.txt"
-    res=get_technical_requirements(truncate_docfile,invalid_path,processed_filepath)
+    # processed_filepath = pdf2txt(truncate_file)
+    # processed_filepath = convert_pdf_to_markdown(truncate_file)
+    processed_filepath=r"C:\Users\Administrator\Desktop\货物标\extract_files\6.2定版视频会议磋商文件html.txt"
+    res=get_technical_requirements(invalid_path,processed_filepath)
    json_string = json.dumps(res, ensure_ascii=False, indent=4)
    print(json_string)
    # # input_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1"
--- a/flask_app/货物标/提取采购需求main.py
+++ b/flask_app/货物标/提取采购需求main.py
@ -12,7 +12,6 @@ from flask_app.货物标.商务服务其他要求提取 import get_business_requ

 # 获取采购清单
 def fetch_procurement_reqs(procurement_path, invalid_path):
-    procurement_docpath = pdf2docx(procurement_path)  # 采购需求docx
    # 定义默认的 procurement_reqs 字典
    DEFAULT_PROCUREMENT_REQS = {
        "采购需求": "",
@ -27,16 +26,15 @@ def fetch_procurement_reqs(procurement_path, invalid_path):
        return DEFAULT_PROCUREMENT_REQS.copy()

    try:
-        # processed_filepath = convert_pdf_to_markdown(procurement_path)   # 转markdown格式
-        processed_filepath = pdf2txt(procurement_path)  # 纯文本提取
+        processed_filepath = convert_pdf_to_markdown(procurement_path)   # 转markdown格式
+        # processed_filepath = pdf2txt(procurement_path)  # 纯文本提取
        # 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements
        with concurrent.futures.ThreadPoolExecutor() as executor:

            # 提交任务给线程池
-            future_technical = executor.submit(get_technical_requirements, procurement_docpath, invalid_path,
-                                               processed_filepath)
+            future_technical = executor.submit(get_technical_requirements, invalid_path,processed_filepath)
            time.sleep(0.5)  # 保持原有的延时
-            future_business = executor.submit(get_business_requirements, procurement_path, procurement_docpath)
+            future_business = executor.submit(get_business_requirements, procurement_path, processed_filepath)

            # 获取并行任务的结果
            technical_requirements = future_technical.result()
@ -49,10 +47,6 @@ def fetch_procurement_reqs(procurement_path, invalid_path):

        procurement_reqs.update(business_requirements)

-        # 如果需要确保所有默认键存在，可以取消下面的注释
-        # for key, default_value in DEFAULT_PROCUREMENT_REQS.items():
-        #     procurement_reqs.setdefault(key, default_value)
-
        return procurement_reqs

    except Exception as e:
@ -67,7 +61,7 @@ if __name__ == "__main__":
    start_time = time.time()
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\货物标output"
    # file_path="C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件（2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目）_procurement.pdf"
-    procurement_path = r"C:\Users\Administrator\Desktop\货物标\output1\陕西省公安厅交通警察总队高速公路交通安全智能感知巡查系统项目 (1)_procurement.pdf"
+    procurement_path = r"C:\Users\Administrator\Desktop\货物标\output1\6.2定版视频会议磋商文件_procurement.pdf"
    procurement_docpath = r"C:\Users\Administrator\Desktop\fsdownload\fa0d51a1-0d63-4c0d-9002-cf8ac3f2211a"
    invalid_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\db79e9e0-830e-442c-8cb6-1d036215f8ff\\ztbfile.pdf"
    res = fetch_procurement_reqs(procurement_path, invalid_path)