12.19 修复豆包模型使用的bug

2024-12-19 14:21:33 +08:00 · 2024-12-19 14:21:33 +08:00 · 867fc976b3
commit 867fc976b3
parent f5261087b4
3 changed files with 27 additions and 5 deletions
--- a/flask_app/general/format_change.py
+++ b/flask_app/general/format_change.py
@@ -138,6 +138,15 @@ def pdf2docx(local_path_in):
 def doc2docx(local_path_in):
    if not local_path_in:
        return ""
+    # 获取文件名和所在文件夹
+    filename, folder = get_filename_and_folder(local_path_in)
+
+    # 检查是否已经存在同名的 .docx 文件
+    docx_file_path = os.path.join(folder, f"{filename}.docx")
+    if os.path.exists(docx_file_path):
+        print(f"Skipping conversion, {docx_file_path} already exists.")
+        return docx_file_path
+
    remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2d'
    receive_download_url = upload_file(local_path_in, remote_url)
    print(receive_download_url)
@@ -149,6 +158,13 @@ def doc2docx(local_path_in):
 def docx2pdf(local_path_in):
    if not local_path_in:
        return ""
+    # 获取文件名和所在文件夹
+    filename, folder = get_filename_and_folder(local_path_in)
+    # 检查是否已经存在同名的 .pdf 文件
+    pdf_file_path = os.path.join(folder, f"{filename}.pdf")
+    if os.path.exists(pdf_file_path):
+        print(f"Skipping conversion, {pdf_file_path} already exists.")
+        return pdf_file_path
    remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p'
    receive_download_url = upload_file(local_path_in, remote_url)
    filename, folder = get_filename_and_folder(local_path_in)  # 输入输出在同一个文件夹
--- a/flask_app/货物标/技术参数要求提取.py
+++ b/flask_app/货物标/技术参数要求提取.py
@@ -372,7 +372,7 @@ def get_technical_requirements(invalid_path,processed_filepath,model_type=1):
    if '否' in judge_res or model_type==0:
        model_type = 0  # 使用qianwen-long+invalid_path
        print("no!调用invalid_path")
-        if invalid_path.lower().endswith('.pdf'):           #确保上传的是docx
+        if invalid_path.lower().endswith('.pdf'):           #确保上传的是docx  upload中一定是docx，但是get_deviation中可能上传的是pdf
            invalid_path = pdf2docx(invalid_path)
        file_id = upload_file(invalid_path)
        user_query = generate_prompt(judge_res)
--- a/flask_app/货物标/提取采购需求main.py
+++ b/flask_app/货物标/提取采购需求main.py
@@ -28,12 +28,18 @@ def fetch_procurement_reqs(procurement_path, invalid_path):

    try:
        if procurement_path == invalid_path:
+            # 读取 PDF 页码数
            page_count = get_pdf_page_count(procurement_path)
+
+            if page_count > 80:  # 如果页码数大于 80
                model_type = 0
                processed_filepath = ""
            else:
                model_type = 1
                processed_filepath = convert_file_to_markdown(procurement_path)  # 转markdown格式
+        else:
+            model_type = 1
+            processed_filepath = convert_file_to_markdown(procurement_path)  # 转markdown格式
        # processed_filepath = pdf2txt(procurement_path)  # 纯文本提取
        # 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements
        with concurrent.futures.ThreadPoolExecutor() as executor: