2.17 增加读文件pdf接口测试1

This commit is contained in:
zy123 2025-02-17 14:26:11 +08:00
parent f6da90d230
commit 3bf84dbdc4
4 changed files with 25 additions and 24 deletions

View File

@ -65,7 +65,7 @@ def convert_file_to_markdown(file_path, file_name="extract1.txt"):
# 发送OCR请求将PDF转换为Markdown
resp = textin.recognize_pdf2md(image, {
'page_start': 0,
'page_count': 100, # 设置解析页数为100页
'page_count': 80, # 设置解析页数为最高为80页
'table_flavor': 'html', # 按HTML语法输出表格
'parse_mode': 'scan', # 设置解析模式为scan模式
'page_details': 0, # 不包含页面细节

View File

@ -28,13 +28,13 @@ def process_file():
# 生成唯一文件名
filename = os.path.join(output_folder,'ztbfile.pdf')
file_path,file_type=download_file(file_url, filename)
# print(file_path)
# 调用预处理函数
result = read_pdf_main(pdf_path=file_path)
# 处理结果
if not result:
return jsonify({'error': 'File processing failed'})
# # print(file_path)
# # 调用预处理函数
# result = read_pdf_main(pdf_path=file_path)
#
# # 处理结果
# if not result:
# return jsonify({'error': 'File processing failed'})
response_data={
"处理结果":"yes"
}

View File

@ -550,7 +550,7 @@ def get_technical_requirements(invalid_path, processed_filepath, model_type=1):
processed_data = truncate_system_keys(preprocessed_data) # 限制深度
key_paths, grouped_paths, good_list, data_copy = generate_key_paths(
processed_data) # 提取需要采购的货物清单 key_list交通监控视频子系统.高清视频抓拍像机 ... grouped_paths是同一系统下同时有'交换机-1'和'交换机-2',提取'交换机' 输出eg:{'交通标志.标志牌铝板', '交通信号灯.交换机'}
# if len(good_list)>100 and model_type==1: #并发特别高len(good_list)tokens会比较贵以后可以考虑qianwen-long, 目前qianwen-plus:0.0008/ktokens long:0.0005/ktokens 差不多价格,暂不考虑
# if len(good_list)>100 and model_type==1: # 并发特别高(len(good_list)):目前是对每个货物都开一个线程获取结果,对于较多的货物,tokens会比较贵,以后可以考虑qianwen-long;目前qianwen-plus:0.0008/ktokens、long:0.0005/ktokens,差不多价格,暂不考虑
# model_type=2
# file_id=upload_file(processed_filepath)
modified_data = rename_keys(data_copy)

View File

@ -26,22 +26,23 @@ def fetch_procurement_reqs(procurement_path, invalid_path):
try:
proc_path = os.path.abspath(procurement_path)
invalid_path = os.path.abspath(invalid_path)
# 判断路径是否一致:一致表示一开始procurement_path截取为空
if proc_path == invalid_path:
# 读取 PDF 页码数
page_count = get_pdf_page_count(procurement_path) #注意这里的procurement_path可能是docx的invalid_path
if page_count > 60: # 如果页码数大于60,不转markdown
tech_model_type= 2 #long
busi_model_type=3 #long-stream
processed_filepath = ""
else:
tech_model_type= 1 #doubao或者qianwen-plus
busi_model_type =4 #qianwen-plus
processed_filepath = convert_file_to_markdown(procurement_path,"extract3.txt") # invalid_path->markdown格式
page_count = get_pdf_page_count(procurement_path)
if page_count > 60: #转换过多消耗的tokens 以及 文档转换的费用过高退而求其次用qianwen-long
tech_model_type = 2 # long
busi_model_type = 3 # long-stream
processed_filepath = ""
# 如果procurement_path截取成功则更新invalid_path
if proc_path != invalid_path:
invalid_path = proc_path
else:
tech_model_type = 1 #doubao或者qianwen-plus
busi_model_type = 4
processed_filepath = convert_file_to_markdown(procurement_path) # 正常情况procurement_path->markdown格式
tech_model_type = 1 # doubao或者qianwen-plus
busi_model_type = 4 # qianwen-plus
# 根据是否截取成功调用不同参数
if proc_path == invalid_path:
processed_filepath = convert_file_to_markdown(procurement_path, "extract3.txt")
else:
processed_filepath = convert_file_to_markdown(procurement_path)
# processed_filepath = pdf2txt(procurement_path) # 纯文本提取
# 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements
with concurrent.futures.ThreadPoolExecutor() as executor: