From 8a5b3616732e9dc75e0453c65dd8c032a421a212 Mon Sep 17 00:00:00 2001
From: zy123 <646228430@qq.com>
Date: Mon, 30 Dec 2024 10:27:29 +0800
Subject: [PATCH] =?UTF-8?q?12.30=20=E5=BA=9F=E6=A0=87=E9=A1=B9?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 flask_app/general/file2markdown.py | 50 +++++++++++++++++++++++-------
 1 file changed, 38 insertions(+), 12 deletions(-)

diff --git a/flask_app/general/file2markdown.py b/flask_app/general/file2markdown.py
index ce4e9a7..6be096b 100644
--- a/flask_app/general/file2markdown.py
+++ b/flask_app/general/file2markdown.py
@@ -40,37 +40,63 @@ class TextinOcr(object):
         }
 
         return requests.post(url, data=image, headers=headers, params=options)
-def convert_file_to_markdown(file_path):
-    output_folder=os.path.dirname(os.path.abspath(file_path))
-    app_id=os.getenv("TEXTIN_APP_ID")
-    app_key=os.getenv("TEXTIN_APP_KEY")
+def convert_file_to_markdown(file_path, file_name="extract1.txt"):
+    # Directory that contains the input file (resolved from its absolute path)
+    output_folder = os.path.dirname(os.path.abspath(file_path))
+
+    # Full path of the output file inside that directory
+    output_file = os.path.join(output_folder, file_name)
+
+    # Check whether the output file already exists
+    if os.path.exists(output_file):
+        print(f"文件 '{output_file}' 已存在，直接返回路径。")
+        return output_file  # Output already exists: return its path without issuing an OCR request
+
+    # The output file does not exist, so continue with the OCR request
+    app_id = os.getenv("TEXTIN_APP_ID")
+    app_key = os.getenv("TEXTIN_APP_KEY")
+
+    # Initialize the TextinOcr object with the credentials read from the environment
     textin = TextinOcr(app_id, app_key)
+
+    # Read the content of the input file
     image = get_file_content(file_path)
+
+    # Send the OCR request that converts the PDF to Markdown
     resp = textin.recognize_pdf2md(image, {
         'page_start': 0,
-        'page_count': 100,  # 设置解析页数为50页
-        'table_flavor': 'html',  # html 按html语法输出表格
+        'page_count': 100,  # Set the number of pages to parse to 100
+        'table_flavor': 'html',  # Output tables using HTML syntax
         'parse_mode': 'scan',  # 设置解析模式为scan模式
         'page_details': 0,  # 不包含页面细节
         'markdown_details': 1,
         'apply_document_tree': 1,
-        'dpi': 216,  # 分辨率设置默认为144 dpi,
-        'get_image':'none'
+        'dpi': 216,  # Resolution set to 216 dpi
+        'get_image': 'none'
     })
+
+    # Print how long the request took
     print("request time: ", resp.elapsed.total_seconds())
+
+    # Parse the response body as JSON
     data = json.loads(resp.text)
+
+    # Check whether the response contains Markdown content
     if 'result' in data and 'markdown' in data['result']:
         markdown_content = data['result']['markdown']
-        # 定义输出文件名
-        output_file = os.path.join(output_folder,"extract1.txt")
+
         try:
-            # 将 markdown 内容写入文件，使用 UTF-8 编码
+            # Write the Markdown content to the output file using UTF-8 encoding
             with open(output_file, 'w', encoding='utf-8') as file:
                 file.write(markdown_content)
             print(f"Markdown 内容已成功保存到 '{output_file}'")
-            return output_file
+            return output_file  # Return the path of the output file
         except IOError as e:
             print(f"写入文件时出错: {e}")
+            return None  # Writing failed: return None
+    else:
+        print("未能从响应中获取Markdown内容。")
+        return None  # No Markdown content in the response: return None
 
 
 if __name__ == "__main__":