2.17 增加读文件pdf接口测试1

2025-02-17 14:26:11 +08:00 · 2025-02-17 14:26:11 +08:00 · 3bf84dbdc4
commit 3bf84dbdc4
parent f6da90d230
4 changed files with 25 additions and 24 deletions
--- a/flask_app/general/file2markdown.py
+++ b/flask_app/general/file2markdown.py
@@ -65,7 +65,7 @@ def convert_file_to_markdown(file_path, file_name="extract1.txt"):
    # 发送OCR请求，将PDF转换为Markdown
    resp = textin.recognize_pdf2md(image, {
        'page_start': 0,
-        'page_count': 100,  # 设置解析页数为100页
+        'page_count': 80,  # 设置解析页数为最高为80页
        'table_flavor': 'html',  # 按HTML语法输出表格
        'parse_mode': 'scan',  # 设置解析模式为scan模式
        'page_details': 0,  # 不包含页面细节
--- a/flask_app/routes/test_readpdf.py
+++ b/flask_app/routes/test_readpdf.py
@@ -28,13 +28,13 @@ def process_file():
        # 生成唯一文件名
        filename = os.path.join(output_folder,'ztbfile.pdf')
        file_path,file_type=download_file(file_url, filename)
-        # print(file_path)
-        # 调用预处理函数
-        result = read_pdf_main(pdf_path=file_path)
-
-        # 处理结果
-        if not result:
-            return jsonify({'error': 'File processing failed'})
+        # # print(file_path)
+        # # 调用预处理函数
+        # result = read_pdf_main(pdf_path=file_path)
+        #
+        # # 处理结果
+        # if not result:
+        #     return jsonify({'error': 'File processing failed'})
        response_data={
            "处理结果":"yes"
        }
--- a/flask_app/货物标/技术参数要求提取.py
+++ b/flask_app/货物标/技术参数要求提取.py
@@ -550,7 +550,7 @@ def get_technical_requirements(invalid_path, processed_filepath, model_type=1):
    processed_data = truncate_system_keys(preprocessed_data)  # 限制深度
    key_paths, grouped_paths, good_list, data_copy = generate_key_paths(
        processed_data)  # 提取需要采购的货物清单 key_list：交通监控视频子系统.高清视频抓拍像机 ... grouped_paths是同一系统下同时有'交换机-1'和'交换机-2'，提取'交换机'  ，输出eg:{'交通标志.标志牌铝板', '交通信号灯.交换机'}
-    # if len(good_list)>100 and model_type==1:       #并发特别高（len(good_list)），tokens会比较贵，以后可以考虑qianwen-long, 目前qianwen-plus:0.0008/ktokens  long:0.0005/ktokens 差不多价格，暂不考虑
+    # if len(good_list)>100 and model_type==1:       #并发特别高（len(good_list)），目前是对每个货物都开一个线程获取结果，对于较多的货物，tokens会比较贵，以后可以考虑qianwen-long, 目前qianwen-plus:0.0008/ktokens  long:0.0005/ktokens 差不多价格，暂不考虑
    #     model_type=2
    #     file_id=upload_file(processed_filepath)
    modified_data = rename_keys(data_copy)
--- a/flask_app/货物标/提取采购需求main.py
+++ b/flask_app/货物标/提取采购需求main.py
@@ -26,22 +26,23 @@ def fetch_procurement_reqs(procurement_path, invalid_path):
    try:
        proc_path = os.path.abspath(procurement_path)
        invalid_path = os.path.abspath(invalid_path)
-        # 判断路径是否一致，一致表示一开始procurement_path截取为空
-        if proc_path == invalid_path:
-            # 读取 PDF 页码数
-            page_count = get_pdf_page_count(procurement_path)     #注意这里的procurement_path可能是docx的invalid_path
-            if page_count > 60:  # 如果页码数大于60,不转markdown
-                tech_model_type= 2    #long
-                busi_model_type=3    #long-stream
-                processed_filepath = ""
-            else:
-                tech_model_type= 1    #doubao或者qianwen-plus
-                busi_model_type =4   #qianwen-plus
-                processed_filepath = convert_file_to_markdown(procurement_path,"extract3.txt")  # invalid_path->markdown格式
+        page_count = get_pdf_page_count(procurement_path)
+
+        if page_count > 60:    #转换过多，消耗的tokens 以及 文档转换的费用过高！退而求其次用qianwen-long
+            tech_model_type = 2  # long
+            busi_model_type = 3  # long-stream
+            processed_filepath = ""
+            # 如果procurement_path截取成功，则更新invalid_path
+            if proc_path != invalid_path:
+                invalid_path = proc_path
        else:
-            tech_model_type = 1     #doubao或者qianwen-plus
-            busi_model_type = 4
-            processed_filepath = convert_file_to_markdown(procurement_path)  # 正常情况：procurement_path->markdown格式
+            tech_model_type = 1  # doubao或者qianwen-plus
+            busi_model_type = 4  # qianwen-plus
+            # 根据是否截取成功调用不同参数
+            if proc_path == invalid_path:
+                processed_filepath = convert_file_to_markdown(procurement_path, "extract3.txt")
+            else:
+                processed_filepath = convert_file_to_markdown(procurement_path)
        # processed_filepath = pdf2txt(procurement_path)  # 纯文本提取
        # 使用 ThreadPoolExecutor 并行处理 get_technical_requirements 和 get_business_requirements
        with concurrent.futures.ThreadPoolExecutor() as executor: