zbparse/flask_app/general/file2markdown.py
2024-12-03 09:07:14 +08:00

78 lines
3.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os.path
import requests
import json
def get_file_content(filePath):
    """Return the complete contents of the file at *filePath* as bytes.

    The file is opened in binary mode, so no text decoding is applied.
    """
    with open(filePath, mode='rb') as handle:
        payload = handle.read()
    return payload
class TextinOcr(object):
    """Small client for the Textin ``pdf_to_markdown`` HTTP endpoint.

    Stores the application credentials and the service host. Every request
    sends the credentials as the ``x-ti-app-id`` and ``x-ti-secret-code``
    headers.
    """

    def __init__(self, app_id, app_secret):
        """Remember the credentials and set the API host.

        :param app_id: value sent in the ``x-ti-app-id`` header
        :param app_secret: value sent in the ``x-ti-secret-code`` header
        """
        self._app_id = app_id
        self._app_secret = app_secret
        self.host = 'https://api.textin.com'

    def recognize_pdf2md(self, image, options, timeout=None):
        """
        pdf to markdown

        :param image: file bytes, sent as the raw request body
        :param options: request params, sent as URL query parameters
        :param timeout: optional timeout forwarded to ``requests.post``
            (seconds, or a ``(connect, read)`` tuple). The default ``None``
            keeps the previous behaviour of waiting indefinitely; pass a
            value to avoid hanging forever on a stalled connection.
        :return: response object from ``requests.post``; the HTTP status is
            not checked here, so callers must inspect it themselves

        Example ``options`` (notes translated from the original docstring):
        options = {
            'pdf_pwd': None,  # password, needed when the PDF is encrypted.
                              # Note: when wrapping this API for a front end,
                              # protect the password yourself
            'dpi': 144,  # set dpi to 144
            'page_start': 0,  # for a PDF upload, the page to start converting from
            'page_count': 1000,  # number of pages to parse, here 1000
            'apply_document_tree': 1,  # whether to generate headings; default 1 generates them
            'markdown_details': 1,
            'page_details': 0,  # do not include page detail information
            'table_flavor': 'md',  # output tables in md syntax
            'get_image': 'none',  # images to return with the markdown; default 'none' returns no images
            'parse_mode': 'scan',  # PDF parse mode; default 'scan' handles the file by text recognition only
        }
        """
        url = self.host + '/ai/service/v1/pdf_to_markdown'
        headers = {
            'x-ti-app-id': self._app_id,
            'x-ti-secret-code': self._app_secret,
        }
        return requests.post(
            url,
            data=image,
            headers=headers,
            params=options,
            timeout=timeout,
        )
def convert_pdf_to_markdown(file_path):
    """Convert a PDF to Markdown through Textin and save the result to disk.

    The file's bytes are posted to the Textin ``pdf_to_markdown`` endpoint.
    The markdown found at ``result.markdown`` in the JSON response is written
    as UTF-8 to ``extract1.txt`` in the same directory as *file_path*.

    :param file_path: path of the PDF to convert
    :return: path of the written ``extract1.txt``, or ``None`` when the
        response is not valid JSON, contains no markdown, or the output
        file cannot be written
    """
    output_folder = os.path.dirname(os.path.abspath(file_path))

    # NOTE(review): these credentials are hard-coded in source control.
    # They remain as fallbacks so existing deployments keep working, but can
    # now be overridden through the environment; the committed secret should
    # be rotated and removed.
    app_id = os.environ.get('TEXTIN_APP_ID', '77c60b5b961381ba80f427966cdfe8ee')
    app_secret = os.environ.get('TEXTIN_APP_SECRET', '31261fde2bb4ffed73f13ace24c495b5')
    textin = TextinOcr(app_id, app_secret)

    image = get_file_content(file_path)
    resp = textin.recognize_pdf2md(image, {
        'page_start': 0,
        'page_count': 50,  # parse at most 50 pages
        'table_flavor': 'md',  # per the original note, 'html' would emit tables in HTML syntax
        'parse_mode': 'scan',  # use the 'scan' parse mode
        'page_details': 0,  # do not include page details
        'markdown_details': 1,
        'apply_document_tree': 1,
        'dpi': 144,  # resolution set to 144 dpi
    })
    print("request time: ", resp.elapsed.total_seconds())

    # An error page or an empty body is not JSON; report it instead of
    # crashing with an unexplained decode error.
    try:
        data = json.loads(resp.text)
    except ValueError as e:
        print(f"响应不是合法的 JSON (HTTP {resp.status_code}): {e}")
        return None

    # Guard each level: a failed request may omit 'result' or set it to null.
    result = data.get('result') if isinstance(data, dict) else None
    markdown_content = result.get('markdown') if isinstance(result, dict) else None
    if markdown_content is None:
        # presumably the service reports failures via 'code'/'message' — verify against the API docs
        code = data.get('code') if isinstance(data, dict) else None
        message = data.get('message') if isinstance(data, dict) else None
        print(f"响应中未包含 markdown 内容 (HTTP {resp.status_code}, code={code}, message={message})")
        return None

    # Output file name is fixed; it lands beside the input file.
    output_file = os.path.join(output_folder, "extract1.txt")
    try:
        # Write the markdown content using UTF-8 encoding.
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(markdown_content)
    except IOError as e:
        print(f"写入文件时出错: {e}")
        return None

    print(f"Markdown 内容已成功保存到 '{output_file}'")
    return output_file
if __name__ == "__main__":
    # Manual smoke run against a local sample document.
    # Alternative sample kept from the original for reference:
    # r"C:\Users\Administrator\Desktop\fsdownload\e702f1e6-095d-443d-bb7d-ef2e42037cb1\ztbfile_procurement.pdf"
    sample_pdf = r"C:\Users\Administrator\Desktop\货物标\output1\招标文件实高电子显示屏_procurement.pdf"
    # Prints the path of the written file, or None if the conversion failed.
    print(convert_pdf_to_markdown(sample_pdf))