2024-11-17 12:11:11 +08:00
|
|
|
|
import os.path
|
2024-11-16 14:24:58 +08:00
|
|
|
|
import requests
|
|
|
|
|
import json
|
|
|
|
|
|
|
|
|
|
def get_file_content(filePath):
    """Return the complete contents of a file as raw bytes.

    :param filePath: path of the file to read
    :return: the file's bytes, read in binary mode
    """
    with open(filePath, mode='rb') as source:
        payload = source.read()
    return payload
|
|
|
|
|
|
|
|
|
|
class TextinOcr(object):
    """Small HTTP client for the TextIn pdf-to-markdown endpoint."""

    def __init__(self, app_id, app_secret):
        """Remember the credentials and the API host.

        :param app_id: value sent in the ``x-ti-app-id`` header
        :param app_secret: value sent in the ``x-ti-secret-code`` header
        """
        self.host = 'https://api.textin.com'
        self._app_id, self._app_secret = app_id, app_secret

    def recognize_pdf2md(self, image, options):
        """
        pdf to markdown

        :param image: file bytes, sent as the request body
        :param options: request params, sent as the URL query string
        :return: response object returned by ``requests.post``

        Example ``options``::

            options = {
                'pdf_pwd': None,  # password, required when the pdf is encrypted. Note: when wrapping this API for a front end, protect the password yourself
                'dpi': 144,  # set dpi to 144
                'page_start': 0,  # when a pdf is uploaded, the page to start converting from
                'page_count': 1000,  # parse 1000 pages
                'apply_document_tree': 1,  # whether to generate headings; default 1 (generate)
                'markdown_details': 1,
                'page_details': 0,  # do not include page detail information
                'table_flavor': 'md',  # output tables in md syntax
                'get_image': 'none',  # images to fetch for the markdown; default 'none' returns no images
                'parse_mode': 'scan',  # PDF parse mode; default 'scan' handles the file by text recognition only
            }
        """
        endpoint_path = '/ai/service/v1/pdf_to_markdown'
        endpoint = self.host + endpoint_path

        # Credentials travel as request headers.
        auth_headers = {}
        auth_headers['x-ti-app-id'] = self._app_id
        auth_headers['x-ti-secret-code'] = self._app_secret

        response = requests.post(
            endpoint,
            params=options,
            headers=auth_headers,
            data=image,
        )
        return response
|
2025-01-17 10:47:43 +08:00
|
|
|
|
#调用textIn:pdf/word->markdown
|
2024-12-30 10:27:29 +08:00
|
|
|
|
def convert_file_to_markdown(file_path, file_name="extract1.txt"):
    """Convert a document to Markdown via TextIn and save it beside the input.

    The Markdown is written to ``file_name`` inside the directory that
    contains ``file_path``. When that output file already exists it is
    returned as-is and no request is sent.

    Credentials are read from the ``TEXTIN_APP_ID`` and ``TEXTIN_APP_KEY``
    environment variables.

    :param file_path: path of the document to convert
    :param file_name: name of the output file created next to ``file_path``
    :return: path of the output file, or None when the credentials are
        missing, the response is not usable JSON, the response carries no
        Markdown text, or the output file cannot be written
    :raises: errors raised while reading ``file_path`` or while sending the
        HTTP request are not caught here and propagate to the caller
    """
    # Directory that holds the input file (resolved to an absolute path).
    output_folder = os.path.dirname(os.path.abspath(file_path))
    output_file = os.path.join(output_folder, file_name)

    # Reuse an existing result instead of sending another request.
    if os.path.exists(output_file):
        print(f"文件 '{output_file}' 已存在,直接返回路径。")
        return output_file

    app_id = os.getenv("TEXTIN_APP_ID")
    app_key = os.getenv("TEXTIN_APP_KEY")
    # Fail early with a clear message: unset variables would otherwise be
    # forwarded as None header values to the HTTP layer.
    if not app_id or not app_key:
        print("未设置环境变量 TEXTIN_APP_ID 或 TEXTIN_APP_KEY,无法调用TextIn接口。")
        return None

    textin = TextinOcr(app_id, app_key)
    image = get_file_content(file_path)

    # Send the OCR request that converts the document to Markdown.
    resp = textin.recognize_pdf2md(image, {
        'page_start': 0,
        'page_count': 100,  # parse 100 pages
        'table_flavor': 'html',  # output tables in HTML syntax
        'parse_mode': 'scan',  # use the scan parse mode
        'page_details': 0,  # do not include page details
        'markdown_details': 1,
        'apply_document_tree': 1,
        'dpi': 216,  # resolution of 216 dpi
        'get_image': 'none',
    })

    # Time spent on the network round trip.
    print("Request network time: ", resp.elapsed.total_seconds(), "seconds")

    # A body that is not JSON (or not a JSON object) cannot contain the
    # Markdown, so treat it as a failed conversion rather than crashing.
    try:
        data = json.loads(resp.text)
    except ValueError as e:
        print(f"响应内容不是合法的JSON: {e}")
        return None
    if not isinstance(data, dict):
        print("未能从响应中获取Markdown内容。")
        return None

    duration = data.get('duration', "错误!")
    print("实际转换时间(ms):", duration)

    # 'result' may be absent, null or not an object; only a string under
    # result.markdown can be written to the output file.
    result = data.get('result')
    markdown_content = result.get('markdown') if isinstance(result, dict) else None
    if not isinstance(markdown_content, str):
        print("未能从响应中获取Markdown内容。")
        return None

    try:
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(markdown_content)
    except IOError as e:
        print(f"写入文件时出错: {e}")
        # Best-effort cleanup: a partial file left behind would be taken
        # for a finished result by the existence check on the next call.
        try:
            os.remove(output_file)
        except OSError:
            pass
        return None

    print(f"Markdown 内容已成功保存到 '{output_file}'")
    return output_file
|
2024-11-17 12:11:11 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual run: convert one local sample PDF and print the path of the
    # generated file (None is printed when the conversion fails).
    # Sample input used previously:
    # r"C:\Users\Administrator\Desktop\fsdownload\e702f1e6-095d-443d-bb7d-ef2e42037cb1\ztbfile_procurement.pdf"
    file_path = r"C:\Users\Administrator\Desktop\招标文件\tmp_pdf_wo_pages_1.pdf"
    print(convert_file_to_markdown(file_path))
|