12.6 扫描件pdf的处理

2024-12-06 09:25:24 +08:00 · 2024-12-06 09:25:24 +08:00 · e0ea02544d
commit e0ea02544d
parent 6bb7d334d9
3 changed files with 32 additions and 10 deletions
--- a/flask_app/general/clean_pdf.py
+++ b/flask_app/general/clean_pdf.py
@@ -80,13 +80,31 @@ def clean_page_content(text, common_header):
    text = re.sub(r'\s*[—-]\s*\d+\s*[—-]\s*', '', text)  # 删除形如 '—2—' 或 '-2-' 的页码
    return text

-def is_scanned_pdf(file_path):
+
+from PyPDF2 import PdfReader
+
+
+def is_scanned_pdf(file_path, max_pages=15):
+    """
+    检查 PDF 是否为扫描件（即前 max_pages 页均提取不到文本）。
+
+    参数:
+    - file_path: PDF 文件路径。
+    - max_pages: 最大检查页数，默认为 15 页。
+
+    返回:
+    - True: 如果前 max_pages 页都没有文本，认为是扫描件。
+    - False: 如果前 max_pages 页中任意一页有文本，认为不是扫描件。
+    """
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
-        for page in reader.pages:
+        for i, page in enumerate(reader.pages):
+            if i >= max_pages:  # 超过最大检查页数，停止检查
+                break
            if page.extract_text().strip():  # 如果有文本
                return False  # 不是扫描型
-    return True  # 全部页面都没有文本
+    return True  # 前 max_pages 页都没有文本
+

 if __name__ == '__main__':
    file_path = r"C:\Users\Administrator\Documents\WeChat Files\wxid_d11awe5rp1y722\FileStorage\File\2024-12\2020-安徽-安徽省生态环境厅电梯采购.pdf"
--- a/flask_app/general/通义千问long.py
+++ b/flask_app/general/通义千问long.py
@@ -106,12 +106,13 @@ def qianwen_long(file_id, user_query, max_retries=2, backoff_factor=1.0):
                    print(f"查询 '{user_query}' 的所有 {max_retries + 1} 次尝试均失败（429 错误）。")
                    break
            elif error_code == 400 and error_code_string in ['data_inspection_failed', 'ResponseTimeout','DataInspectionFailed','response_timeout','request_timeout',"RequestTimeOut"]:
-                if attempt == 1:  # 只重试一次
-                    print(f"错误代码为 400 - {error_code_string}，将立即重试...")
-                    continue  # 直接跳到下一次循环（即重试一次）
-                else:
-                    print(f"查询 '{user_query}' 的所有 {max_retries + 1} 次尝试均失败（400 - {error_code_string}）。")
-                    break
+                print(f"错误代码为 400 - {error_code_string}，将调用 qianwen_long_stream 执行一次...")
+                try:
+                    # 超时就调用 qianwen_long_stream
+                    return qianwen_long_stream(file_id, user_query, max_retries=0)  # 禁用内部重试
+                except Exception as stream_exc:
+                    print(f"调用 qianwen_long_stream 时出错：{stream_exc}")
+                    break  # 跳出循环，不再重试
            else:
                # 对于非 429 和非特定 400 错误，不进行重试，直接抛出异常
                print(f"遇到非 429 或非 'data_inspection_failed' 的 400 错误（错误代码：{error_code}），不进行重试。")
@@ -217,6 +218,7 @@ def qianwen_long_stream(file_id, user_query, max_retries = 2, backoff_factor = 1
                    time.sleep(sleep_time)
                else:
                    print(f"查询 '{user_query}' 的所有 {max_retries + 1} 次尝试均失败（429 错误）。")
+                    break
            elif error_code == 400 and error_code_string in ['data_inspection_failed', 'ResponseTimeout',
                                                             'DataInspectionFailed', 'response_timeout','RequestTimeOut','request_timeout']:
                if attempt == 1:  # 只重试一次
@@ -224,9 +226,11 @@ def qianwen_long_stream(file_id, user_query, max_retries = 2, backoff_factor = 1
                    continue  # 直接跳到下一次循环（即重试一次）
                else:
                    print(f"查询 '{user_query}' 的所有 {max_retries + 1} 次尝试均失败（400 - {error_code_string}）。")
+                    break
            else:
                # 对于非 429 和非特定 400 错误，不进行重试，直接抛出异常
                print(f"遇到非 429 或非 'data_inspection_failed...' 的 400 错误（错误代码：{error_code}），不进行重试。")
+                break

    # 如果所有尝试都失败了，返回空字符串
    return ""
--- a/flask_app/routes/货物标解析main.py
+++ b/flask_app/routes/货物标解析main.py
@@ -274,7 +274,7 @@ def goods_bid_main(output_folder, file_path, file_type, unique_id):
 #TODO:考虑把解析失败的调用豆包，全文上传。

 #TODO:重置一下投标文件格式提取那部分的代码
-#TODO:qianwenlong重试那边，改流式
+
 #TODO:小解析考虑提速：1：直接pdf转文本，再切分  2.多线程读取每页是否有图片
 #商务标这里改为列表最里层
 #good_list 金额  截取上下文