12.20 豆包大模型bug解决

This commit is contained in:
zy123 2024-12-20 17:47:56 +08:00
parent de254d3c29
commit 008df2fc4a
2 changed files with 11 additions and 9 deletions

View File

@ -325,7 +325,7 @@ def clean_dict_datas(extracted_contents, keywords, excludes): # 让正则表达
if any(exclude in data for exclude in excludes): if any(exclude in data for exclude in excludes):
continue # 如果包含任何排除字符串,跳过这个数据 continue # 如果包含任何排除字符串,跳过这个数据
# 去掉开头的序号eg:1 | (1) |2 | 1. | 2全角点| 3、 | 1.1 | 2.3.4 | A1 | C1.1 | 一、 # 去掉开头的序号eg:1 | (1) |2 | 1. | 2全角点| 3、 | 1.1 | 2.3.4 | A1 | C1.1 | 一、
pattern = r'^\s*(?:[(]\d+[))]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、.)\]?|[一二三四五六七八九十]+、)' pattern = r'^\s*(?:[(]\d+[))]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、.)\]+|[一二三四五六七八九十]+、)'
data = re.sub(pattern, '', data).strip() data = re.sub(pattern, '', data).strip()
keyword_match = re.search(keywords, data) keyword_match = re.search(keywords, data)
if keyword_match: if keyword_match:
@ -355,7 +355,7 @@ def clean_dict_datas(extracted_contents, keywords, excludes): # 让正则表达
# print("*********") # print("*********")
new_text_list = preprocess_text_list(text_list) new_text_list = preprocess_text_list(text_list)
# 用于处理结构化文本,清理掉不必要的序号,并将分割后的段落合并,最终形成更简洁和格式化的输出。 # 用于处理结构化文本,清理掉不必要的序号,并将分割后的段落合并,最终形成更简洁和格式化的输出。
pattern = r'^\s*(?:[(]\d+[))]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、.)\]?|[一二三四五六七八九十]+、)' pattern = r'^\s*(?:[(]\d+[))]|[A-Za-z]?\d+(?:\.\s*\d+)*[\s\.、.)\]+|[一二三四五六七八九十]+、)'
data = re.sub(pattern, '', new_text_list[0]).strip() # 去除序号 data = re.sub(pattern, '', new_text_list[0]).strip() # 去除序号
# 将修改后的第一个元素和剩余的元素连接起来 # 将修改后的第一个元素和剩余的元素连接起来
@ -632,6 +632,7 @@ def combine_find_invalid(invalid_docpath, output_dir):
print("无效标与废标done...") print("无效标与废标done...")
return {"无效标与废标项": combined_dict} return {"无效标与废标项": combined_dict}
# TODO: 修改此处“无效标”相关的提示词
if __name__ == '__main__': if __name__ == '__main__':
start_time = time.time() start_time = time.time()
# truncate_json_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2\\竞争性谈判文件(3)_tobidders_notice_part1\\truncate_output.json" # truncate_json_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output4\\tmp2\\竞争性谈判文件(3)_tobidders_notice_part1\\truncate_output.json"
@ -641,9 +642,10 @@ if __name__ == '__main__':
# doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx' # doc_path = r'C:\Users\Administrator\Desktop\new招标文件\tmp\2024-贵州-贵州省罗甸县 2024 年度广州市协作资金龙坪镇、边阳镇产业路硬化建设项目.docx'
pdf_path=r'C:\Users\Administrator\Desktop\fsdownload\8a9ebd69-af0d-4661-a8ce-78136cb6bc4f\ztbfile.pdf' pdf_path=r'C:\Users\Administrator\Desktop\fsdownload\8a9ebd69-af0d-4661-a8ce-78136cb6bc4f\ztbfile.pdf'
output_dir = r"C:\Users\Administrator\Desktop\new招标文件\tmp\new招标文件\tmp" output_dir = r"C:\Users\Administrator\Desktop\fsdownload\edccf32d-84ed-453d-b4ec-ef912c6786b0\tmp"
invalid_added=insert_mark(pdf_path) # invalid_added=insert_mark(pdf_path)
invalid_added_docx=pdf2docx(invalid_added) # invalid_added_docx=pdf2docx(invalid_added)
invalid_added_docx=r'C:\Users\Administrator\Desktop\fsdownload\edccf32d-84ed-453d-b4ec-ef912c6786b0\invalid_added.docx'
results = combine_find_invalid(invalid_added_docx, output_dir) results = combine_find_invalid(invalid_added_docx, output_dir)
end_time = time.time() end_time = time.time()
print("Results:", json.dumps(results, ensure_ascii=False, indent=4)) print("Results:", json.dumps(results, ensure_ascii=False, indent=4))

View File

@ -300,7 +300,7 @@ def get_business_requirements(procurement_path, processed_filepath, model_type):
if busi_user_query: if busi_user_query:
if model_type: if model_type:
# 如果是模型调用,直接使用 doubao_model # 如果是模型调用,直接使用 doubao_model
future = executor.submit(doubao_model, busi_user_query) future = executor.submit(doubao_model, busi_user_query,True)
else: else:
# 使用 qianwen_long_stream 并传入 file_id # 使用 qianwen_long_stream 并传入 file_id
future = executor.submit(qianwen_long_stream, file_id, busi_user_query, 2, 1,True) future = executor.submit(qianwen_long_stream, file_id, busi_user_query, 2, 1,True)
@ -310,7 +310,7 @@ def get_business_requirements(procurement_path, processed_filepath, model_type):
if tech_user_query: if tech_user_query:
if model_type: if model_type:
# 如果是模型调用,直接使用 doubao_model # 如果是模型调用,直接使用 doubao_model
future = executor.submit(doubao_model, tech_user_query) future = executor.submit(doubao_model, tech_user_query,True)
else: else:
# 使用 qianwen_long_stream 并传入 file_id # 使用 qianwen_long_stream 并传入 file_id
future = executor.submit(qianwen_long_stream, file_id, tech_user_query, 2, 1,True) future = executor.submit(qianwen_long_stream, file_id, tech_user_query, 2, 1,True)
@ -348,11 +348,11 @@ def get_business_requirements(procurement_path, processed_filepath, model_type):
if __name__ == "__main__": if __name__ == "__main__":
# truncate_file = "C:\\Users\\Administrator\\Desktop\\fsdownload\\e4be098d-b378-4126-9c32-a742b237b3b1\\ztbfile_procurement.docx" # truncate_file = "C:\\Users\\Administrator\\Desktop\\fsdownload\\e4be098d-b378-4126-9c32-a742b237b3b1\\ztbfile_procurement.docx"
# truncate_file = r"C:\Users\Administrator\Desktop\货物标\output1\2-招标文件广水市教育局封闭管理_procurement.pdf" # truncate_file = r"C:\Users\Administrator\Desktop\货物标\output1\2-招标文件广水市教育局封闭管理_procurement.pdf"
procurement_path=r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\ztbfile_procurement.pdf' procurement_path=r'C:\Users\Administrator\Desktop\fsdownload\edccf32d-84ed-453d-b4ec-ef912c6786b0\ztbfile_procurement.pdf'
docx_path=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\invalid_added.docx' docx_path=r'D:\flask_project\flask_app\static\output\output1\83ae3e35-9136-4402-a74f-01d7adfcbb73\invalid_added.docx'
# truncate_file=r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0519-001-招标文件_procurement.pdf" # truncate_file=r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0519-001-招标文件_procurement.pdf"
# file_id = upload_file(truncate_file) # file_id = upload_file(truncate_file)
# processed_filepath = pdf2txt(procurement_path) # processed_filepath = pdf2txt(procurement_path)
processed_filepath=r'C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\be901ea0-adc9-47b8-9ada-5c3bc0dd9434\extract1.txt' processed_filepath=r'C:\Users\Administrator\Desktop\fsdownload\edccf32d-84ed-453d-b4ec-ef912c6786b0\extract1.txt'
final_res= get_business_requirements(procurement_path,processed_filepath,1) final_res= get_business_requirements(procurement_path,processed_filepath,1)
print(json.dumps(final_res, ensure_ascii=False, indent=4)) print(json.dumps(final_res, ensure_ascii=False, indent=4))