zbparse/flask_app/general/file2markdown.py

80 lines
3.3 KiB
Python
Raw Normal View History

2024-11-17 12:11:11 +08:00
import os.path
2024-11-16 14:24:58 +08:00
import requests
import json
def get_file_content(filePath):
    """Return the complete contents of the file at *filePath* as bytes.

    :param filePath: path of the file to read
    :return: the raw bytes of the file
    """
    # Binary mode: the payload is uploaded as-is, so no text decoding is wanted.
    with open(filePath, mode='rb') as source:
        payload = source.read()
    return payload
class TextinOcr:
    """Small HTTP client for the document-parsing service at api.textin.com.

    The application id and secret passed to the constructor are sent as the
    ``x-ti-app-id`` and ``x-ti-secret-code`` request headers.
    """

    def __init__(self, app_id, app_secret):
        self._app_id, self._app_secret = app_id, app_secret
        self.host = 'https://api.textin.com'

    def recognize_pdf2md(self, image, options):
        """POST a document to the ``pdf_to_markdown`` endpoint.

        :param image: file bytes, sent as the request body
        :param options: request parameters, sent as the query string
        :return: the response object returned by ``requests.post``

        Example ``options``::

            options = {
                'pdf_pwd': None,           # password, required when the PDF is encrypted
                                           # (protect it yourself when wrapping this API
                                           # for a front end)
                'dpi': 144,                # render at 144 dpi
                'page_start': 0,           # for PDFs: page to start converting from
                'page_count': 1000,        # parse up to 1000 pages
                'apply_document_tree': 1,  # generate headings (1, the default, enables it)
                'markdown_details': 1,
                'page_details': 0,         # omit per-page detail information
                'table_flavor': 'md',      # emit tables in Markdown syntax
                'get_image': 'none',       # images in the Markdown; 'none' (default)
                                           # returns no images
                'parse_mode': 'scan',      # PDF parse mode; 'scan' (default) treats
                                           # pages purely as text recognition
            }
        """
        auth_headers = {}
        auth_headers['x-ti-app-id'] = self._app_id
        auth_headers['x-ti-secret-code'] = self._app_secret
        endpoint = self.host + '/ai/service/v1/pdf_to_markdown'
        # NOTE(review): no timeout is passed, so this call waits for as long as
        # the server takes to answer.
        response = requests.post(
            endpoint,
            data=image,
            headers=auth_headers,
            params=options,
        )
        return response
2024-12-17 14:47:19 +08:00
def convert_file_to_markdown(file_path):
    """Convert a document to Markdown through the TextIn OCR service.

    The file is uploaded with ``TextinOcr.recognize_pdf2md`` and the Markdown
    found under ``result.markdown`` in the JSON reply is written, UTF-8
    encoded, to ``extract1.txt`` in the same directory as *file_path*.

    Credentials are read from the ``TEXTIN_APP_ID`` and ``TEXTIN_APP_KEY``
    environment variables.

    :param file_path: path of the document to convert
    :return: path of the written ``extract1.txt``; ``None`` when the
        credentials are not configured, the reply is not JSON or carries no
        Markdown text, or the output file cannot be written
    """
    output_folder = os.path.dirname(os.path.abspath(file_path))

    app_id = os.getenv("TEXTIN_APP_ID")
    app_key = os.getenv("TEXTIN_APP_KEY")
    if not app_id or not app_key:
        # Fail fast: without credentials the upload cannot be authenticated,
        # so do not read or send the document at all.
        print("TEXTIN_APP_ID / TEXTIN_APP_KEY is not set; skipping conversion")
        return None

    textin = TextinOcr(app_id, app_key)
    image = get_file_content(file_path)
    resp = textin.recognize_pdf2md(image, {
        'page_start': 0,
        'page_count': 100,          # parse at most 100 pages
        'table_flavor': 'html',     # emit tables in HTML syntax
        'parse_mode': 'auto',       # parse mode 'auto'
        'page_details': 0,          # omit per-page details
        'markdown_details': 1,
        'apply_document_tree': 1,
        'dpi': 216,                 # render at 216 dpi (original note: default is 144)
        'get_image': 'none',
    })
    print("request time: ", resp.elapsed.total_seconds())

    try:
        data = json.loads(resp.text)
    except ValueError as e:
        # json.JSONDecodeError is a ValueError; a non-JSON body (for example an
        # HTML error page) is reported instead of crashing the caller.
        print(f"TextIn response is not valid JSON (HTTP {resp.status_code}): {e}")
        return None

    # Guard each level: a bare membership test would raise TypeError when
    # 'result' is present but is not a mapping (e.g. null).
    result = data.get('result') if isinstance(data, dict) else None
    markdown_content = result.get('markdown') if isinstance(result, dict) else None
    if not isinstance(markdown_content, str):
        print(
            f"TextIn response contains no markdown (HTTP {resp.status_code}): "
            f"{resp.text[:200]}"
        )
        return None

    # Output file name
    output_file = os.path.join(output_folder, "extract1.txt")
    try:
        # Write the Markdown content using UTF-8 encoding
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(markdown_content)
    except IOError as e:
        print(f"写入文件时出错: {e}")
        return None

    print(f"Markdown 内容已成功保存到 '{output_file}'")
    return output_file
2024-11-17 12:11:11 +08:00
if __name__ == "__main__":
    # Manual run against a local sample document; prints the value returned
    # by convert_file_to_markdown.
    # Alternative sample:
    # r"C:\Users\Administrator\Desktop\fsdownload\e702f1e6-095d-443d-bb7d-ef2e42037cb1\ztbfile_procurement.pdf"
    sample_pdf = r"C:\Users\Administrator\Desktop\new招标文件\output5\HBDL-2024-0514-001-招标文件_procurement\HBDL-2024-0514-001-招标文件_procurement_1.pdf"
    print(convert_file_to_markdown(sample_pdf))