74 lines
3.1 KiB
Python
74 lines
3.1 KiB
Python
import requests
|
||
import json
|
||
|
||
def get_file_content(filePath):
    """Return the full contents of a file as bytes.

    :param filePath: path of the file to read; it is opened in binary mode
    :return: the bytes read from the file
    """
    # The context manager closes the handle even if the read fails.
    with open(filePath, 'rb') as fp:
        return fp.read()
|
||
|
||
class TextinOcr:
    """Small client for the Textin PDF-to-Markdown HTTP endpoint.

    Stores the application credentials and the API host; the credentials are
    sent with each request as the ``x-ti-app-id`` and ``x-ti-secret-code``
    headers.
    """

    def __init__(self, app_id, app_secret):
        """Remember the credentials and set the API host.

        :param app_id: value sent in the ``x-ti-app-id`` header
        :param app_secret: value sent in the ``x-ti-secret-code`` header
        """
        self._app_id = app_id
        self._app_secret = app_secret
        self.host = 'https://api.textin.com'

    def recognize_pdf2md(self, image, options=None, timeout=None):
        """
        pdf to markdown

        :param image: file bytes, sent as the raw request body
        :param options: request params, sent as the URL query string; may be
            omitted (``None``) to send no query parameters
        :param timeout: forwarded to ``requests.post``; the default ``None``
            keeps the original behaviour of waiting without a limit
        :return: response (whatever ``requests.post`` returns)

        options = {
            'pdf_pwd': None,  # password, required when the pdf is encrypted. Note: when wrapping this API for a front end, protect the password yourself
            'dpi': 144,  # set dpi to 144
            'page_start': 0,  # for pdf uploads, the page to start converting from
            'page_count': 1000,  # parse up to 1000 pages
            'apply_document_tree': 1,  # whether to generate headings; default 1 (generate headings)
            'markdown_details': 1,
            'page_details': 0,  # do not include page detail information
            'table_flavor': 'md',  # output tables using md syntax
            'get_image': 'none',  # images referenced by the markdown; default 'none' returns no images
            'parse_mode': 'scan',  # PDF parse mode; default 'scan' processes the file by text recognition only
        }
        """
        url = self.host + '/ai/service/v1/pdf_to_markdown'
        headers = {
            'x-ti-app-id': self._app_id,
            'x-ti-secret-code': self._app_secret,
        }

        # timeout=None matches requests' own default, so callers that do not
        # pass it see exactly the previous behaviour.
        return requests.post(
            url,
            data=image,
            headers=headers,
            params=options,
            timeout=timeout,
        )
|
||
|
||
|
||
if __name__ == "__main__":
    # Log in and go to "Workbench - Account Settings - Developer Info" to look
    # up the app-id / app-secret.
    # NOTE(review): these credentials are hard-coded in source; consider
    # loading them from the environment or an untracked config file.
    textin = TextinOcr('77c60b5b961381ba80f427966cdfe8ee', '31261fde2bb4ffed73f13ace24c495b5')
    file_path = r'C:\Users\Administrator\Desktop\货物标\output1\磋商文件_procurement.pdf'
    image = get_file_content(file_path)

    resp = textin.recognize_pdf2md(image, {
        'page_start': 0,
        'page_count': 50,  # parse 50 pages
        'table_flavor': 'md',  # 'html' would output tables using html syntax
        'parse_mode': 'scan',  # use the scan parse mode
        'page_details': 0,  # do not include page details
        'markdown_details': 1,
        'apply_document_tree': 1,
        'dpi': 144,  # resolution set to 144 dpi
    })
    print("request time: ", resp.elapsed.total_seconds())

    try:
        data = json.loads(resp.text)
    except ValueError as e:
        # json.JSONDecodeError subclasses ValueError; report the bad body
        # instead of dying with a traceback.
        print(f"响应不是有效的 JSON (HTTP {resp.status_code}): {e}")
    else:
        # Guard each level so a null / non-object 'result' cannot raise.
        result = data.get('result') if isinstance(data, dict) else None
        markdown_content = result.get('markdown') if isinstance(result, dict) else None

        if markdown_content is None:
            # Previously this case was skipped silently; show what came back.
            print(f"响应中未包含 markdown 内容: {data}")
        else:
            # Name of the output file.
            output_file = 'output1.txt'
            try:
                # Write the markdown content to the file using UTF-8.
                with open(output_file, 'w', encoding='utf-8') as file:
                    file.write(markdown_content)
            except OSError as e:
                print(f"写入文件时出错: {e}")
            else:
                print(f"Markdown 内容已成功保存到 '{output_file}'")
|