# zbparse/flask_app/general/file2markdown.py

import os.path
import requests
import json

def get_file_content(filePath):
    """Return the entire contents of the file at ``filePath`` as bytes."""
    # Binary mode: the payload is uploaded as-is, so no text decoding is wanted.
    with open(filePath, mode='rb') as stream:
        raw_bytes = stream.read()
    return raw_bytes

class TextinOcr(object):
    """Small client for the TextIn HTTP API (pdf_to_markdown endpoint).

    Stores the application credentials and the API base URL; every request
    sends the credentials as ``x-ti-app-id`` / ``x-ti-secret-code`` headers.
    """

    def __init__(self, app_id, app_secret, host='https://api.textin.com'):
        """
        :param app_id: value sent in the ``x-ti-app-id`` header
        :param app_secret: value sent in the ``x-ti-secret-code`` header
        :param host: base URL of the API. Defaults to the public TextIn host;
            a trailing slash is stripped so endpoint paths can be appended
            without producing a double slash.
        """
        self._app_id = app_id
        self._app_secret = app_secret
        self.host = host.rstrip('/')

    def recognize_pdf2md(self, image, options, timeout=None):
        """
        pdf to markdown
        :param image: file bytes, sent as the raw request body
        :param options: request params, sent as the URL query string
        :param timeout: forwarded to ``requests.post``. ``None`` (the default)
            keeps the original behaviour of waiting without a limit; pass a
            number of seconds or a ``(connect, read)`` tuple to bound the wait.
        :return: response (the ``requests`` response object, not parsed)

        options = {
            'pdf_pwd': None,  # password, required when the PDF is encrypted. Note: when wrapping this API for a front end, protect the password yourself
            'dpi': 144,  # set dpi to 144
            'page_start': 0,   # when a PDF is uploaded, the page to start converting from
            'page_count': 1000,  # parse up to 1000 pages
            'apply_document_tree': 1,   # whether to generate headings; default 1 (generate headings)
            'markdown_details': 1,
            'page_details': 0,  # do not include page detail information
            'table_flavor': 'md',   # output tables in Markdown syntax
            'get_image': 'none',   # which images in the markdown to return; default 'none' returns no images
            'parse_mode': 'scan',  # PDF parse mode; default 'scan' processes by text recognition only
        }
        """
        url = self.host + '/ai/service/v1/pdf_to_markdown'
        headers = {
            'x-ti-app-id': self._app_id,
            'x-ti-secret-code': self._app_secret
        }

        return requests.post(
            url,
            data=image,
            headers=headers,
            params=options,
            timeout=timeout,
        )
# Call TextIn: pdf/word -> markdown
def convert_file_to_markdown(file_path, file_name="extract1.txt"):
    """Convert a document to Markdown through TextIn and cache it on disk.

    The Markdown is saved as ``file_name`` in the same folder as
    ``file_path``. When that output file already exists it is returned
    immediately and no request is made.

    Credentials are read from the ``TEXTIN_APP_ID`` and ``TEXTIN_APP_KEY``
    environment variables.

    :param file_path: path of the document to upload
    :param file_name: name of the output file created next to ``file_path``
    :return: path of the Markdown file, or ``None`` when the credentials are
        missing, the response carries no usable Markdown, or the output file
        cannot be written
    :raises: errors from reading ``file_path`` or from the HTTP request itself
        are not caught here and propagate to the caller
    """
    # The output lives in the folder that contains the input file.
    output_folder = os.path.dirname(os.path.abspath(file_path))
    output_file = os.path.join(output_folder, file_name)

    # An existing output file acts as a cache: skip the OCR request entirely.
    if os.path.exists(output_file):
        print(f"文件 '{output_file}' 已存在，直接返回路径。")
        return output_file

    app_id = os.getenv("TEXTIN_APP_ID")
    app_key = os.getenv("TEXTIN_APP_KEY")
    # Fail fast instead of reading and uploading the whole file only to be
    # rejected for missing credentials.
    if not app_id or not app_key:
        print("环境变量 TEXTIN_APP_ID 或 TEXTIN_APP_KEY 未设置，无法调用TextIn。")
        return None

    textin = TextinOcr(app_id, app_key)
    image = get_file_content(file_path)

    # Send the OCR request that converts the document to Markdown.
    resp = textin.recognize_pdf2md(image, {
        'page_start': 0,
        'page_count': 100,  # parse at most 100 pages
        'table_flavor': 'html',  # output tables in HTML syntax
        'parse_mode': 'scan',  # use the 'scan' parse mode
        'page_details': 0,  # do not include page details
        'markdown_details': 1,
        'apply_document_tree': 1,
        'dpi': 216,  # resolution of 216 dpi
        'get_image': 'none'
    })

    # Time spent on the network round trip.
    print("Request network time: ", resp.elapsed.total_seconds(), "seconds")

    # The body may not be JSON at all (e.g. an HTML error page from a proxy).
    try:
        data = json.loads(resp.text)
    except ValueError as e:
        print(f"响应不是有效的JSON (HTTP状态码 {resp.status_code}): {e}")
        return None

    if not isinstance(data, dict):
        print("未能从响应中获取Markdown内容。")
        return None

    duration = data.get('duration', "错误！")
    print("实际转换时间(ms):", duration)

    # 'result' can be absent, null or of an unexpected type on error responses.
    result = data.get('result')
    markdown_content = result.get('markdown') if isinstance(result, dict) else None
    if not isinstance(markdown_content, str):
        print("未能从响应中获取Markdown内容。")
        return None

    try:
        _write_text_atomically(output_file, markdown_content)
    except OSError as e:
        print(f"写入文件时出错: {e}")
        return None

    print(f"Markdown 内容已成功保存到 '{output_file}'")
    return output_file


def _write_text_atomically(target_path, text):
    """Write ``text`` (UTF-8) to ``target_path`` without exposing partial data.

    The text goes to a uniquely named temporary file in the same folder and is
    then moved into place with ``os.replace``. A failed or interrupted write
    therefore never leaves a truncated file at ``target_path`` that a later
    existence check would mistake for a finished result.

    :raises OSError: when the temporary file cannot be written or moved
    """
    import uuid  # local import: only needed on this code path

    temp_path = f"{target_path}.{uuid.uuid4().hex}.tmp"
    try:
        with open(temp_path, 'w', encoding='utf-8') as file:
            file.write(text)
        os.replace(temp_path, target_path)
    except OSError:
        # Best-effort cleanup of the temporary file; the original error is
        # the one the caller needs to see.
        try:
            os.remove(temp_path)
        except OSError:
            pass
        raise


if __name__ == "__main__":
    # Manual smoke run: convert one local sample PDF and print the returned
    # output path (None is printed when no Markdown file was produced).
    # Alternative sample input kept for reference:
    # r"C:\Users\Administrator\Desktop\fsdownload\e702f1e6-095d-443d-bb7d-ef2e42037cb1\ztbfile_procurement.pdf"
    sample_pdf = r"C:\Users\Administrator\Desktop\招标文件\tmp_pdf_wo_pages_1.pdf"
    print(convert_file_to_markdown(sample_pdf))