2.17 增加读文件pdf接口测试1
This commit is contained in:
parent
f6da90d230
commit
3bf84dbdc4
@ -65,7 +65,7 @@ def convert_file_to_markdown(file_path, file_name="extract1.txt"):
|
||||
# 发送OCR请求,将PDF转换为Markdown
|
||||
resp = textin.recognize_pdf2md(image, {
|
||||
'page_start': 0,
|
||||
'page_count': 100, # 设置解析页数为100页
|
||||
'page_count': 80, # 设置解析页数最高为80页
|
||||
'table_flavor': 'html', # 按HTML语法输出表格
|
||||
'parse_mode': 'scan', # 设置解析模式为scan模式
|
||||
'page_details': 0, # 不包含页面细节
|
||||
|
@ -28,13 +28,13 @@ def process_file():
|
||||
# 生成唯一文件名
|
||||
filename = os.path.join(output_folder,'ztbfile.pdf')
|
||||
file_path,file_type=download_file(file_url, filename)
|
||||
# print(file_path)
|
||||
# 调用预处理函数
|
||||
result = read_pdf_main(pdf_path=file_path)
|
||||
|
||||
# 处理结果
|
||||
if not result:
|
||||
return jsonify({'error': 'File processing failed'})
|
||||
# # print(file_path)
|
||||
# # 调用预处理函数
|
||||
# result = read_pdf_main(pdf_path=file_path)
|
||||
#
|
||||
# # 处理结果
|
||||
# if not result:
|
||||
# return jsonify({'error': 'File processing failed'})
|
||||
response_data={
|
||||
"处理结果":"yes"
|
||||
}
|
||||
|
@ -550,7 +550,7 @@ def get_technical_requirements(invalid_path, processed_filepath, model_type=1):
|
||||
processed_data = truncate_system_keys(preprocessed_data) # 限制深度
|
||||
key_paths, grouped_paths, good_list, data_copy = generate_key_paths(
|
||||
processed_data) # 提取需要采购的货物清单 key_list:交通监控视频子系统.高清视频抓拍像机 ... grouped_paths是同一系统下同时有'交换机-1'和'交换机-2',提取'交换机' ,输出eg:{'交通标志.标志牌铝板', '交通信号灯.交换机'}
|
||||
# if len(good_list)>100 and model_type==1: #并发特别高(len(good_list)),tokens会比较贵,以后可以考虑qianwen-long, 目前qianwen-plus:0.0008/ktokens long:0.0005/ktokens 差不多价格,暂不考虑
|
||||
# if len(good_list)>100 and model_type==1: #并发特别高(len(good_list)),目前是对每个货物都开一个线程获取结果,对于较多的货物,tokens会比较贵,以后可以考虑qianwen-long, 目前qianwen-plus:0.0008/ktokens long:0.0005/ktokens 差不多价格,暂不考虑
|
||||
# model_type=2
|
||||
# file_id=upload_file(processed_filepath)
|
||||
modified_data = rename_keys(data_copy)
|
||||
|
@ -26,22 +26,23 @@ def fetch_procurement_reqs(procurement_path, invalid_path):
|
||||
try:
|
||||
proc_path = os.path.abspath(procurement_path)
|
||||
invalid_path = os.path.abspath(invalid_path)
|
||||
# 判断路径是否一致,一致表示一开始procurement_path截取为空
|
||||
if proc_path == invalid_path:
|
||||
# 读取 PDF 页码数
|
||||
page_count = get_pdf_page_count(procurement_path) #注意这里的procurement_path可能是docx的invalid_path
|
||||
if page_count > 60: # 如果页码数大于60,不转markdown
|
||||
tech_model_type= 2 #long
|
||||
busi_model_type=3 #long-stream
|
||||
processed_filepath = ""
|
||||
else:
|
||||
tech_model_type= 1 #doubao或者qianwen-plus
|
||||
busi_model_type =4 #qianwen-plus
|
||||
processed_filepath = convert_file_to_markdown(procurement_path,"extract3.txt") # invalid_path->markdown格式
|
||||
page_count = get_pdf_page_count(procurement_path)
|
||||
|
||||
if page_count > 60: #转换过多,消耗的tokens 以及 文档转换的费用过高!退而求其次用qianwen-long
|
||||
tech_model_type = 2 # long
|
||||
busi_model_type = 3 # long-stream
|
||||
processed_filepath = ""
|
||||
# 如果procurement_path截取成功,则更新invalid_path
|
||||
if proc_path != invalid_path:
|
||||
invalid_path = proc_path
|
||||
else:
|
||||
tech_model_type = 1 #doubao或者qianwen-plus
|
||||
busi_model_type = 4
|
||||
processed_filepath = convert_file_to_markdown(procurement_path) # 正常情况:procurement_path->markdown格式
|
||||
tech_model_type = 1 # doubao或者qianwen-plus
|
||||
busi_model_type = 4 # qianwen-plus
|
||||
# 根据是否截取成功调用不同参数
|
||||
if proc_path == invalid_path:
|
||||
processed_filepath = convert_file_to_markdown(procurement_path, "extract3.txt")
|
||||
else:
|
||||
processed_filepath = convert_file_to_markdown(procurement_path)
|
||||
# processed_filepath = pdf2txt(procurement_path) # 纯文本提取
|
||||
# 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
|
Loading…
x
Reference in New Issue
Block a user