diff --git a/flask_app/general/无效标和废标公共代码.py b/flask_app/general/无效标和废标公共代码.py index 952d418..026ebb6 100644 --- a/flask_app/general/无效标和废标公共代码.py +++ b/flask_app/general/无效标和废标公共代码.py @@ -325,7 +325,7 @@ def clean_dict_datas(extracted_contents, keywords, excludes): # 让正则表达 if any(exclude in data for exclude in excludes): continue # 如果包含任何排除字符串,跳过这个数据 # 去掉开头的序号,eg:1 | (1) |(2) | 1. | 2.(全角点)| 3、 | 1.1 | 2.3.4 | A1 | C1.1 | 一、 - pattern = r'^\s*(?:[((]\d+[)))]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、.)\)]?|[一二三四五六七八九十]+、)' + pattern = r'^\s*(?:[((]\d+[)))]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、.)\)]+|[一二三四五六七八九十]+、)' data = re.sub(pattern, '', data).strip() keyword_match = re.search(keywords, data) if keyword_match: @@ -355,7 +355,7 @@ def clean_dict_datas(extracted_contents, keywords, excludes): # 让正则表达 # print("*********") new_text_list = preprocess_text_list(text_list) # 用于处理结构化文本,清理掉不必要的序号,并将分割后的段落合并,最终形成更简洁和格式化的输出。 - pattern = r'^\s*(?:[((]\d+[)))]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、.)\)]?|[一二三四五六七八九十]+、)' + pattern = r'^\s*(?:[((]\d+[)))]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、.)\)]+|[一二三四五六七八九十]+、)' data = re.sub(pattern, '', new_text_list[0]).strip() # 去除序号 # 将修改后的第一个元素和剩余的元素连接起来 @@ -632,6 +632,7 @@ def combine_find_invalid(invalid_docpath, output_dir): print("无效标与废标done...") return {"无效标与废标项": combined_dict} +#TODO:无效标这里提示词修改一下 if __name__ == '__main__': start_time = time.time() # truncate_json_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2\\竞争性谈判文件(3)_tobidders_notice_part1\\truncate_output.json" @@ -641,9 +642,10 @@ if __name__ == '__main__': # doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx' pdf_path=r'C:\Users\Administrator\Desktop\fsdownload\8a9ebd69-af0d-4661-a8ce-78136cb6bc4f\ztbfile.pdf' - output_dir = r"C:\Users\Administrator\Desktop\new招标文件\tmp\new招标文件\tmp" - invalid_added=insert_mark(pdf_path) - invalid_added_docx=pdf2docx(invalid_added) + output_dir = r"C:\Users\Administrator\Desktop\fsdownload\edccf32d-84ed-453d-b4ec-ef912c6786b0\tmp" + # invalid_added=insert_mark(pdf_path) + # invalid_added_docx=pdf2docx(invalid_added) + invalid_added_docx=r'C:\Users\Administrator\Desktop\fsdownload\edccf32d-84ed-453d-b4ec-ef912c6786b0\invalid_added.docx' results = combine_find_invalid(invalid_added_docx, output_dir) end_time = time.time() print("Results:", json.dumps(results, ensure_ascii=False, indent=4)) diff --git a/flask_app/货物标/商务服务其他要求提取.py b/flask_app/货物标/商务服务其他要求提取.py index 9c1f574..fe8755c 100644 --- a/flask_app/货物标/商务服务其他要求提取.py +++ b/flask_app/货物标/商务服务其他要求提取.py @@ -300,7 +300,7 @@ def get_business_requirements(procurement_path, processed_filepath, model_type): if busi_user_query: if model_type: # 如果是模型调用,直接使用 doubao_model - future = executor.submit(doubao_model, busi_user_query) + future = executor.submit(doubao_model, busi_user_query,True) else: # 使用 qianwen_long_stream 并传入 file_id future = executor.submit(qianwen_long_stream, file_id, busi_user_query, 2, 1,True) @@ -310,7 +310,7 @@ def get_business_requirements(procurement_path, processed_filepath, model_type): if tech_user_query: if model_type: # 如果是模型调用,直接使用 doubao_model - future = executor.submit(doubao_model, tech_user_query) + future = executor.submit(doubao_model, tech_user_query,True) else: # 使用 qianwen_long_stream 并传入 file_id future = executor.submit(qianwen_long_stream, file_id, tech_user_query, 2, 1,True) @@ -348,11 +348,11 @@ def get_business_requirements(procurement_path, processed_filepath, model_type): if __name__ == "__main__": # truncate_file = "C:\\Users\\Administrator\\Desktop\\fsdownload\\e4be098d-b378-4126-9c32-a742b237b3b1\\ztbfile_procurement.docx" # truncate_file = r"C:\Users\Administrator\Desktop\货物标\output1\2-招标文件(广水市教育局封闭管理)_procurement.pdf" - procurement_path=r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile_procurement.pdf' + procurement_path=r'C:\Users\Administrator\Desktop\fsdownload\edccf32d-84ed-453d-b4ec-ef912c6786b0\ztbfile_procurement.pdf' docx_path=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\invalid_added.docx' # truncate_file=r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0519-001-招标文件_procurement.pdf" # file_id = upload_file(truncate_file) # processed_filepath = pdf2txt(procurement_path) - processed_filepath=r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\extract1.txt' + processed_filepath=r'C:\Users\Administrator\Desktop\fsdownload\edccf32d-84ed-453d-b4ec-ef912c6786b0\extract1.txt' final_res= get_business_requirements(procurement_path,processed_filepath,1) print(json.dumps(final_res, ensure_ascii=False, indent=4))