diff --git a/flask_app/general/file2markdown.py b/flask_app/general/file2markdown.py index 1810efc..fd2a3c9 100644 --- a/flask_app/general/file2markdown.py +++ b/flask_app/general/file2markdown.py @@ -65,7 +65,7 @@ def convert_file_to_markdown(file_path, file_name="extract1.txt"): # 发送OCR请求,将PDF转换为Markdown resp = textin.recognize_pdf2md(image, { 'page_start': 0, - 'page_count': 100, # 设置解析页数为100页 + 'page_count': 80, # 设置解析页数为最高为80页 'table_flavor': 'html', # 按HTML语法输出表格 'parse_mode': 'scan', # 设置解析模式为scan模式 'page_details': 0, # 不包含页面细节 diff --git a/flask_app/routes/test_readpdf.py b/flask_app/routes/test_readpdf.py index 4110bd7..42ed689 100644 --- a/flask_app/routes/test_readpdf.py +++ b/flask_app/routes/test_readpdf.py @@ -28,13 +28,13 @@ def process_file(): # 生成唯一文件名 filename = os.path.join(output_folder,'ztbfile.pdf') file_path,file_type=download_file(file_url, filename) - # print(file_path) - # 调用预处理函数 - result = read_pdf_main(pdf_path=file_path) - - # 处理结果 - if not result: - return jsonify({'error': 'File processing failed'}) + # # print(file_path) + # # 调用预处理函数 + # result = read_pdf_main(pdf_path=file_path) + # + # # 处理结果 + # if not result: + # return jsonify({'error': 'File processing failed'}) response_data={ "处理结果":"yes" } diff --git a/flask_app/货物标/技术参数要求提取.py b/flask_app/货物标/技术参数要求提取.py index 4933eb4..9a682a1 100644 --- a/flask_app/货物标/技术参数要求提取.py +++ b/flask_app/货物标/技术参数要求提取.py @@ -550,7 +550,7 @@ def get_technical_requirements(invalid_path, processed_filepath, model_type=1): processed_data = truncate_system_keys(preprocessed_data) # 限制深度 key_paths, grouped_paths, good_list, data_copy = generate_key_paths( processed_data) # 提取需要采购的货物清单 key_list:交通监控视频子系统.高清视频抓拍像机 ... grouped_paths是同一系统下同时有'交换机-1'和'交换机-2',提取'交换机' ,输出eg:{'交通标志.标志牌铝板', '交通信号灯.交换机'} - # if len(good_list)>100 and model_type==1: #并发特别高(len(good_list)),tokens会比较贵,以后可以考虑qianwen-long, 目前qianwen-plus:0.0008/ktokens long:0.0005/ktokens 差不多价格,暂不考虑 + # if len(good_list)>100 and model_type==1: #并发特别高(len(good_list)),目前是对每个货物都开一个线程获取结果,对于较多的货物,tokens会比较贵,以后可以考虑qianwen-long, 目前qianwen-plus:0.0008/ktokens long:0.0005/ktokens 差不多价格,暂不考虑 # model_type=2 # file_id=upload_file(processed_filepath) modified_data = rename_keys(data_copy) diff --git a/flask_app/货物标/提取采购需求main.py b/flask_app/货物标/提取采购需求main.py index a3a87fe..b6d8ef9 100644 --- a/flask_app/货物标/提取采购需求main.py +++ b/flask_app/货物标/提取采购需求main.py @@ -26,22 +26,23 @@ def fetch_procurement_reqs(procurement_path, invalid_path): try: proc_path = os.path.abspath(procurement_path) invalid_path = os.path.abspath(invalid_path) - # 判断路径是否一致,一致表示一开始procurement_path截取为空 - if proc_path == invalid_path: - # 读取 PDF 页码数 - page_count = get_pdf_page_count(procurement_path) #注意这里的procurement_path可能是docx的invalid_path - if page_count > 60: # 如果页码数大于60,不转markdown - tech_model_type= 2 #long - busi_model_type=3 #long-stream - processed_filepath = "" - else: - tech_model_type= 1 #doubao或者qianwen-plus - busi_model_type =4 #qianwen-plus - processed_filepath = convert_file_to_markdown(procurement_path,"extract3.txt") # invalid_path->markdown格式 + page_count = get_pdf_page_count(procurement_path) + + if page_count > 60: #转换过多,消耗的tokens 以及 文档转换的费用过高!退而求其次用qianwen-long + tech_model_type = 2 # long + busi_model_type = 3 # long-stream + processed_filepath = "" + # 如果procurement_path截取成功,则更新invalid_path + if proc_path != invalid_path: + invalid_path = proc_path else: - tech_model_type = 1 #doubao或者qianwen-plus - busi_model_type = 4 - processed_filepath = convert_file_to_markdown(procurement_path) # 正常情况:procurement_path->markdown格式 + tech_model_type = 1 # doubao或者qianwen-plus + busi_model_type = 4 # qianwen-plus + # 根据是否截取成功调用不同参数 + if proc_path == invalid_path: + processed_filepath = convert_file_to_markdown(procurement_path, "extract3.txt") + else: + processed_filepath = convert_file_to_markdown(procurement_path) # processed_filepath = pdf2txt(procurement_path) # 纯文本提取 # 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements with concurrent.futures.ThreadPoolExecutor() as executor: