12.19 invalid_path转md格式前增加判断

2024-12-19 15:12:42 +08:00 · 2024-12-19 15:12:42 +08:00 · e19aaa04f6
commit e19aaa04f6
parent 27830d271c
2 changed files with 44 additions and 9 deletions
--- a/flask_app/货物标/商务服务其他要求提取.py
+++ b/flask_app/货物标/商务服务其他要求提取.py
@@ -7,7 +7,7 @@ from flask_app.general.doubao import read_txt_to_string, pdf2txt
 from flask_app.general.json_utils import combine_json_results, clean_json_string
 from flask_app.general.通义千问long import upload_file, qianwen_long_stream
 from flask_app.货物标.截取pdf货物标版 import extract_common_header, clean_page_content
-from flask_app.general.format_change import docx2pdf
+from flask_app.general.format_change import docx2pdf, pdf2docx
 import concurrent.futures
 from flask_app.general.doubao import doubao_model

@@ -255,26 +255,60 @@ def generate_template(required_keys,full_text, type=1):
        user_query_template += f"\n\n文件内容：{full_text}"
    return user_query_template

-def get_business_requirements(procurement_path,processed_filepath):
-    required_keys = ["技\s*术\s*要\s*求", "商\s*务\s*要\s*求", "服\s*务\s*要\s*求", "其\s*他\s*要\s*求","总\s*体\s*要\s*求","建\s*设\s*要\s*求","进\s*度\s*要\s*求","工\s*期\s*要\s*求","质\s*保\s*要\s*求","培\s*训\s*要\s*求","售\s*后\s*要\s*求"]
-    procurement_pdf_path=procurement_path
+
+def get_business_requirements(procurement_path, processed_filepath, model_type):
+    required_keys = ["技\s*术\s*要\s*求", "商\s*务\s*要\s*求", "服\s*务\s*要\s*求", "其\s*他\s*要\s*求",
+                     "总\s*体\s*要\s*求", "建\s*设\s*要\s*求", "进\s*度\s*要\s*求", "工\s*期\s*要\s*求",
+                     "质\s*保\s*要\s*求", "培\s*训\s*要\s*求", "售\s*后\s*要\s*求"]
+
+    # 将 doc/docx 转换为 pdf
+    procurement_pdf_path = procurement_path
    if procurement_path.lower().endswith(('.doc', '.docx')):
        procurement_pdf_path = docx2pdf(procurement_path)
+
+    # 查找包含的关键词
    contained_keys = find_exists(procurement_pdf_path, required_keys)
    print(contained_keys)
    if not contained_keys:
        return {}
-    # queries = generate_queries(truncate_file, contained_keys)
+
+    # 读取文件全文
    full_text = read_txt_to_string(processed_filepath)
+
+    # 生成业务查询和技术查询
    busi_user_query = generate_template(contained_keys, full_text, 1)
    tech_user_query = generate_template(contained_keys, full_text, 2)
-    final_res={}
+
+    # 初始化结果存储
+    final_res = {}
+
+    # 如果是非模型调用，需要提前上传文件并获取 file_id
+    file_id = None
+    if not model_type:
+        procurement_docx_path=procurement_path
+        if procurement_path.lower().endswith('.pdf'):
+            procurement_docx_path = pdf2docx(procurement_path)
+        file_id = upload_file(procurement_docx_path)  # 只上传一次文件，避免冗余调用
+
+    # 并行处理业务和技术查询
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        futures = []
        if busi_user_query:
-            futures.append(executor.submit(doubao_model, busi_user_query))
+            if model_type:
+                # 如果是模型调用，直接使用 doubao_model
+                futures.append(executor.submit(doubao_model, busi_user_query))
+            else:
+                # 使用 qianwen_long_stream 并传入 file_id
+                futures.append(executor.submit(qianwen_long_stream, file_id, busi_user_query, 2, 1))
+
        if tech_user_query:
-            futures.append(executor.submit(doubao_model, tech_user_query))
+            if model_type:
+                # 如果是模型调用，直接使用 doubao_model
+                futures.append(executor.submit(doubao_model, tech_user_query))
+            else:
+                # 使用 qianwen_long_stream 并传入 file_id
+                futures.append(executor.submit(qianwen_long_stream, file_id, tech_user_query, 2, 1))
+
        # 获取结果
        for future in concurrent.futures.as_completed(futures):
            try:
@@ -283,6 +317,7 @@ def get_business_requirements(procurement_path,processed_filepath):
                    final_res.update(clean_json_string(result))
            except Exception as e:
                print(f"An error occurred: {e}")
+
    return final_res


--- a/flask_app/货物标/提取采购需求main.py
+++ b/flask_app/货物标/提取采购需求main.py
@@ -47,7 +47,7 @@ def fetch_procurement_reqs(procurement_path, invalid_path):
            # 提交任务给线程池
            future_technical = executor.submit(get_technical_requirements, invalid_path, processed_filepath,model_type)
            time.sleep(0.5)  # 保持原有的延时
-            future_business = executor.submit(get_business_requirements, procurement_path, processed_filepath)
+            future_business = executor.submit(get_business_requirements, procurement_path, processed_filepath,model_type)
            # 获取并行任务的结果
            technical_requirements = future_technical.result()
            business_requirements = future_business.result()