12.20 豆包大模型bug解决

2024-12-20 17:47:56 +08:00 · 2024-12-20 17:47:56 +08:00 · 008df2fc4a
commit 008df2fc4a
parent de254d3c29
2 changed files with 11 additions and 9 deletions
--- a/flask_app/general/无效标和废标公共代码.py
+++ b/flask_app/general/无效标和废标公共代码.py
@@ -325,7 +325,7 @@ def clean_dict_datas(extracted_contents, keywords, excludes):  # 让正则表达
                if any(exclude in data for exclude in excludes):
                    continue  # 如果包含任何排除字符串，跳过这个数据
                # 去掉开头的序号，eg:1 | (1) |（2） | 1. | 2．（全角点）| 3、 | 1.1 | 2.3.4 | A1 | C1.1 | 一、
-                pattern = r'^\s*(?:[（(]\d+[)）)]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、．)\）]?|[一二三四五六七八九十]+、)'
+                pattern = r'^\s*(?:[（(]\d+[)）)]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、．)\）]+|[一二三四五六七八九十]+、)'
                data = re.sub(pattern, '', data).strip()
                keyword_match = re.search(keywords, data)
                if keyword_match:
@@ -355,7 +355,7 @@ def clean_dict_datas(extracted_contents, keywords, excludes):  # 让正则表达
            # print("*********")
            new_text_list = preprocess_text_list(text_list)
            # 用于处理结构化文本，清理掉不必要的序号，并将分割后的段落合并，最终形成更简洁和格式化的输出。
-            pattern = r'^\s*(?:[（(]\d+[)）)]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、．)\）]?|[一二三四五六七八九十]+、)'
+            pattern = r'^\s*(?:[（(]\d+[)）)]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、．)\）]+|[一二三四五六七八九十]+、)'

            data = re.sub(pattern, '', new_text_list[0]).strip()  # 去除序号
            # 将修改后的第一个元素和剩余的元素连接起来
@@ -632,6 +632,7 @@ def combine_find_invalid(invalid_docpath, output_dir):
    print("无效标与废标done...")
    return {"无效标与废标项": combined_dict}

+#TODO:无效标这里提示词修改一下
 if __name__ == '__main__':
    start_time = time.time()
    # truncate_json_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2\\竞争性谈判文件(3)_tobidders_notice_part1\\truncate_output.json"
@@ -641,9 +642,10 @@ if __name__ == '__main__':
    # doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
    pdf_path=r'C:\Users\Administrator\Desktop\fsdownload\8a9ebd69-af0d-4661-a8ce-78136cb6bc4f\ztbfile.pdf'

-    output_dir = r"C:\Users\Administrator\Desktop\new招标文件\tmp\new招标文件\tmp"
-    invalid_added=insert_mark(pdf_path)
-    invalid_added_docx=pdf2docx(invalid_added)
+    output_dir = r"C:\Users\Administrator\Desktop\fsdownload\edccf32d-84ed-453d-b4ec-ef912c6786b0\tmp"
+    # invalid_added=insert_mark(pdf_path)
+    # invalid_added_docx=pdf2docx(invalid_added)
+    invalid_added_docx=r'C:\Users\Administrator\Desktop\fsdownload\edccf32d-84ed-453d-b4ec-ef912c6786b0\invalid_added.docx'
    results = combine_find_invalid(invalid_added_docx, output_dir)
    end_time = time.time()
    print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
--- a/flask_app/货物标/商务服务其他要求提取.py
+++ b/flask_app/货物标/商务服务其他要求提取.py
@@ -300,7 +300,7 @@ def get_business_requirements(procurement_path, processed_filepath, model_type):
        if busi_user_query:
            if model_type:
                # 如果是模型调用，直接使用 doubao_model
-                future = executor.submit(doubao_model, busi_user_query)
+                future = executor.submit(doubao_model, busi_user_query,True)
            else:
                # 使用 qianwen_long_stream 并传入 file_id
                future = executor.submit(qianwen_long_stream, file_id, busi_user_query, 2, 1,True)
@@ -310,7 +310,7 @@ def get_business_requirements(procurement_path, processed_filepath, model_type):
        if tech_user_query:
            if model_type:
                # 如果是模型调用，直接使用 doubao_model
-                future = executor.submit(doubao_model, tech_user_query)
+                future = executor.submit(doubao_model, tech_user_query,True)
            else:
                # 使用 qianwen_long_stream 并传入 file_id
                future = executor.submit(qianwen_long_stream, file_id, tech_user_query, 2, 1,True)
@@ -348,11 +348,11 @@ def get_business_requirements(procurement_path, processed_filepath, model_type):
 if __name__ == "__main__":
    # truncate_file = "C:\\Users\\Administrator\\Desktop\\fsdownload\\e4be098d-b378-4126-9c32-a742b237b3b1\\ztbfile_procurement.docx"
    # truncate_file = r"C:\Users\Administrator\Desktop\货物标\output1\2-招标文件（广水市教育局封闭管理）_procurement.pdf"
-    procurement_path=r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile_procurement.pdf'
+    procurement_path=r'C:\Users\Administrator\Desktop\fsdownload\edccf32d-84ed-453d-b4ec-ef912c6786b0\ztbfile_procurement.pdf'
    docx_path=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\invalid_added.docx'
    # truncate_file=r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0519-001-招标文件_procurement.pdf"
    # file_id = upload_file(truncate_file)
    # processed_filepath = pdf2txt(procurement_path)
-    processed_filepath=r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\extract1.txt'
+    processed_filepath=r'C:\Users\Administrator\Desktop\fsdownload\edccf32d-84ed-453d-b4ec-ef912c6786b0\extract1.txt'
    final_res= get_business_requirements(procurement_path,processed_filepath,1)
    print(json.dumps(final_res, ensure_ascii=False, indent=4))