12.20 豆包大模型bug解决
This commit is contained in:
parent
de254d3c29
commit
008df2fc4a
@ -325,7 +325,7 @@ def clean_dict_datas(extracted_contents, keywords, excludes): # 让正则表达
|
||||
if any(exclude in data for exclude in excludes):
|
||||
continue # 如果包含任何排除字符串,跳过这个数据
|
||||
# 去掉开头的序号,eg:1 | (1) |(2) | 1. | 2.(全角点)| 3、 | 1.1 | 2.3.4 | A1 | C1.1 | 一、
|
||||
pattern = r'^\s*(?:[((]\d+[)))]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、.)\)]?|[一二三四五六七八九十]+、)'
|
||||
pattern = r'^\s*(?:[((]\d+[)))]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、.)\)]+|[一二三四五六七八九十]+、)'
|
||||
data = re.sub(pattern, '', data).strip()
|
||||
keyword_match = re.search(keywords, data)
|
||||
if keyword_match:
|
||||
@ -355,7 +355,7 @@ def clean_dict_datas(extracted_contents, keywords, excludes): # 让正则表达
|
||||
# print("*********")
|
||||
new_text_list = preprocess_text_list(text_list)
|
||||
# 用于处理结构化文本,清理掉不必要的序号,并将分割后的段落合并,最终形成更简洁和格式化的输出。
|
||||
pattern = r'^\s*(?:[((]\d+[)))]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、.)\)]?|[一二三四五六七八九十]+、)'
|
||||
pattern = r'^\s*(?:[((]\d+[)))]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、.)\)]+|[一二三四五六七八九十]+、)'
|
||||
|
||||
data = re.sub(pattern, '', new_text_list[0]).strip() # 去除序号
|
||||
# 将修改后的第一个元素和剩余的元素连接起来
|
||||
@ -632,6 +632,7 @@ def combine_find_invalid(invalid_docpath, output_dir):
|
||||
print("无效标与废标done...")
|
||||
return {"无效标与废标项": combined_dict}
|
||||
|
||||
#TODO:无效标这里提示词修改一下
|
||||
if __name__ == '__main__':
|
||||
start_time = time.time()
|
||||
# truncate_json_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2\\竞争性谈判文件(3)_tobidders_notice_part1\\truncate_output.json"
|
||||
@ -641,9 +642,10 @@ if __name__ == '__main__':
|
||||
# doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
|
||||
pdf_path=r'C:\Users\Administrator\Desktop\fsdownload\8a9ebd69-af0d-4661-a8ce-78136cb6bc4f\ztbfile.pdf'
|
||||
|
||||
output_dir = r"C:\Users\Administrator\Desktop\new招标文件\tmp\new招标文件\tmp"
|
||||
invalid_added=insert_mark(pdf_path)
|
||||
invalid_added_docx=pdf2docx(invalid_added)
|
||||
output_dir = r"C:\Users\Administrator\Desktop\fsdownload\edccf32d-84ed-453d-b4ec-ef912c6786b0\tmp"
|
||||
# invalid_added=insert_mark(pdf_path)
|
||||
# invalid_added_docx=pdf2docx(invalid_added)
|
||||
invalid_added_docx=r'C:\Users\Administrator\Desktop\fsdownload\edccf32d-84ed-453d-b4ec-ef912c6786b0\invalid_added.docx'
|
||||
results = combine_find_invalid(invalid_added_docx, output_dir)
|
||||
end_time = time.time()
|
||||
print("Results:", json.dumps(results, ensure_ascii=False, indent=4))
|
||||
|
@ -300,7 +300,7 @@ def get_business_requirements(procurement_path, processed_filepath, model_type):
|
||||
if busi_user_query:
|
||||
if model_type:
|
||||
# 如果是模型调用,直接使用 doubao_model
|
||||
future = executor.submit(doubao_model, busi_user_query)
|
||||
future = executor.submit(doubao_model, busi_user_query,True)
|
||||
else:
|
||||
# 使用 qianwen_long_stream 并传入 file_id
|
||||
future = executor.submit(qianwen_long_stream, file_id, busi_user_query, 2, 1,True)
|
||||
@ -310,7 +310,7 @@ def get_business_requirements(procurement_path, processed_filepath, model_type):
|
||||
if tech_user_query:
|
||||
if model_type:
|
||||
# 如果是模型调用,直接使用 doubao_model
|
||||
future = executor.submit(doubao_model, tech_user_query)
|
||||
future = executor.submit(doubao_model, tech_user_query,True)
|
||||
else:
|
||||
# 使用 qianwen_long_stream 并传入 file_id
|
||||
future = executor.submit(qianwen_long_stream, file_id, tech_user_query, 2, 1,True)
|
||||
@ -348,11 +348,11 @@ def get_business_requirements(procurement_path, processed_filepath, model_type):
|
||||
if __name__ == "__main__":
|
||||
# truncate_file = "C:\\Users\\Administrator\\Desktop\\fsdownload\\e4be098d-b378-4126-9c32-a742b237b3b1\\ztbfile_procurement.docx"
|
||||
# truncate_file = r"C:\Users\Administrator\Desktop\货物标\output1\2-招标文件(广水市教育局封闭管理)_procurement.pdf"
|
||||
procurement_path=r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile_procurement.pdf'
|
||||
procurement_path=r'C:\Users\Administrator\Desktop\fsdownload\edccf32d-84ed-453d-b4ec-ef912c6786b0\ztbfile_procurement.pdf'
|
||||
docx_path=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\invalid_added.docx'
|
||||
# truncate_file=r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0519-001-招标文件_procurement.pdf"
|
||||
# file_id = upload_file(truncate_file)
|
||||
# processed_filepath = pdf2txt(procurement_path)
|
||||
processed_filepath=r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\extract1.txt'
|
||||
processed_filepath=r'C:\Users\Administrator\Desktop\fsdownload\edccf32d-84ed-453d-b4ec-ef912c6786b0\extract1.txt'
|
||||
final_res= get_business_requirements(procurement_path,processed_filepath,1)
|
||||
print(json.dumps(final_res, ensure_ascii=False, indent=4))
|
||||
|
Loading…
x
Reference in New Issue
Block a user