12.2 Cleanup bug fixes
This commit is contained in:
parent 58f737f11b
commit 643fc904a3
BIN flask_app/PaddleOCR/PaddleOCR-json.exe (new file, binary not shown)
BIN flask_app/PaddleOCR/api-ms-win-core-libraryloader-l1-2-0.dll (new file, binary not shown)
BIN flask_app/PaddleOCR/api-ms-win-eventing-provider-l1-1-0.dll (new file, binary not shown)
BIN flask_app/PaddleOCR/concrt140.dll (new file, binary not shown)
BIN flask_app/PaddleOCR/libiomp5md.dll (new file, binary not shown)
BIN flask_app/PaddleOCR/mkldnn.dll (new file, binary not shown)
BIN flask_app/PaddleOCR/mklml.dll (new file, binary not shown)
(further new binary files in this commit are not shown)
flask_app/PaddleOCR/models/config_chinese.txt (new file, 13 lines)
@@ -0,0 +1,13 @@
# 简中 PP-OCR v3

# det 检测模型库
det_model_dir models/ch_PP-OCRv3_det_infer

# cls 方向分类器库
cls_model_dir models/ch_ppocr_mobile_v2.0_cls_infer

# rec 识别模型库
rec_model_dir models/ch_PP-OCRv3_rec_infer

# 字典路径
rec_char_dict_path models/dict_chinese.txt
flask_app/PaddleOCR/models/config_chinese_cht.txt (new file, 16 lines)
@@ -0,0 +1,16 @@
# 繁中 PP-OCR v2

# det 检测模型库
det_model_dir models/ch_PP-OCRv3_det_infer

# cls 方向分类器库
cls_model_dir models/ch_ppocr_mobile_v2.0_cls_infer

# rec 识别模型库
rec_model_dir models/chinese_cht_mobile_v2.0_rec_infer

# 字典路径
rec_char_dict_path models/dict_chinese_cht.txt

# v2模型启用
rec_img_h 32
flask_app/PaddleOCR/models/config_cyrillic.txt (new file, 13 lines)
@@ -0,0 +1,13 @@
# 斯拉夫字母(俄语、白俄罗斯语、乌克兰语等) PP-OCR v3

# det 检测模型库
det_model_dir models/ch_PP-OCRv3_det_infer

# cls 方向分类器库
cls_model_dir models/ch_ppocr_mobile_v2.0_cls_infer

# rec 识别模型库
rec_model_dir models/cyrillic_PP-OCRv3_rec_infer

# 字典路径
rec_char_dict_path models/dict_cyrillic.txt
flask_app/PaddleOCR/models/config_en.txt (new file, 13 lines)
@@ -0,0 +1,13 @@
# 纯英文 PP-OCR v3

# det 检测模型库
det_model_dir models/ch_PP-OCRv3_det_infer

# cls 方向分类器库
cls_model_dir models/ch_ppocr_mobile_v2.0_cls_infer

# rec 识别模型库
rec_model_dir models/en_PP-OCRv3_rec_infer

# 字典路径
rec_char_dict_path models/dict_en.txt
flask_app/PaddleOCR/models/config_japan.txt (new file, 13 lines)
@@ -0,0 +1,13 @@
# 日文 PP-OCR v3

# det 检测模型库
det_model_dir models/ch_PP-OCRv3_det_infer

# cls 方向分类器库
cls_model_dir models/ch_ppocr_mobile_v2.0_cls_infer

# rec 识别模型库
rec_model_dir models/japan_PP-OCRv3_rec_infer

# 字典路径
rec_char_dict_path models/dict_japan.txt
flask_app/PaddleOCR/models/config_korean.txt (new file, 13 lines)
@@ -0,0 +1,13 @@
# 韩文 PP-OCR v3

# det 检测模型库
det_model_dir models/ch_PP-OCRv3_det_infer

# cls 方向分类器库
cls_model_dir models/ch_ppocr_mobile_v2.0_cls_infer

# rec 识别模型库
rec_model_dir models/korean_PP-OCRv3_rec_infer

# 字典路径
rec_char_dict_path models/dict_korean.txt
flask_app/PaddleOCR/models/configs.txt (new file, 6 lines)
@@ -0,0 +1,6 @@
config_chinese.txt 简体中文
config_en.txt English
config_chinese_cht.txt 繁體中文
config_japan.txt 日本語
config_korean.txt 한국어
config_cyrillic.txt Русский
(further new binary files not shown)
flask_app/PaddleOCR/models/dict_chinese.txt (new file, 6623 lines; file diff suppressed because it is too large)
flask_app/PaddleOCR/models/dict_chinese_cht.txt (new file, 8421 lines; file diff suppressed because it is too large)
flask_app/PaddleOCR/models/dict_cyrillic.txt (new file, 163 lines; the file lists one character per line)
@@ -0,0 +1,163 @@
(space) ! # $ % & ' ( + , - . / 0 1 2 3 4 5 6 7 8 9 : ? @
A B C D E F G H I J K L M N O P Q R S T U V W X Y Z _
a b c d e f g h i j k l m n o p q r s t u v w x y z
É é Ё Є І Ј Љ Ў
А Б В Г Д Е Ж З И Й К Л М Н О П Р С Т У Ф Х Ц Ч Ш Щ Ъ Ы Ь Э Ю Я
а б в г д е ж з и й к л м н о п р с т у ф х ц ч ш щ ъ ы ь э ю я
ё ђ є і ј љ њ ћ ў џ Ґ ґ
flask_app/PaddleOCR/models/dict_en.txt (new file, 95 lines; the file lists one character per line)
@@ -0,0 +1,95 @@
0 1 2 3 4 5 6 7 8 9 : ; < = > ? @
A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
[ \ ] ^ _ `
a b c d e f g h i j k l m n o p q r s t u v w x y z
{ | } ~ ! " # $ % & ' ( ) * + , - . / (space)
flask_app/PaddleOCR/models/dict_japan.txt (new file, 4399 lines; file diff suppressed because it is too large)
flask_app/PaddleOCR/models/dict_korean.txt (new file, 3688 lines; file diff suppressed because it is too large)
(further new binary files not shown)
BIN flask_app/PaddleOCR/msvcp140.dll (new file, binary not shown)
BIN flask_app/PaddleOCR/onnxruntime.dll (new file, binary not shown)
BIN flask_app/PaddleOCR/opencv_world4100.dll (new file, binary not shown)
BIN flask_app/PaddleOCR/paddle2onnx.dll (new file, binary not shown)
BIN flask_app/PaddleOCR/paddle_inference.dll (new file, binary not shown)
flask_app/PaddleOCR/python_api/.gitignore (new file, vendored, 7 lines)
@@ -0,0 +1,7 @@
# 无视所有图片
*.png
*.jpg
*.jpeg

# 除了 test.jpg
!test.jpg
flask_app/PaddleOCR/python_api/PPOCR_api.py (new file, 337 lines)
@@ -0,0 +1,337 @@
# 调用 PaddleOCR-json.exe 的 Python Api
# 项目主页:
# https://github.com/hiroi-sora/PaddleOCR-json

import os
import socket  # 套接字
import atexit  # 退出处理
import subprocess  # 进程,管道
import re  # regex
from json import loads as jsonLoads, dumps as jsonDumps
from sys import platform as sysPlatform  # popen静默模式
from base64 import b64encode  # base64 编码


class PPOCR_pipe:  # 调用OCR(管道模式)
    def __init__(self, exePath: str, modelsPath: str = None, argument: dict = None):
        """初始化识别器(管道模式)。\n
        `exePath`: 识别器`PaddleOCR_json.exe`的路径。\n
        `modelsPath`: 识别库`models`文件夹的路径。若为None则默认识别库与识别器在同一目录下。\n
        `argument`: 启动参数,字典`{"键":值}`。参数说明见 https://github.com/hiroi-sora/PaddleOCR-json
        """
        # 私有成员变量
        self.__ENABLE_CLIPBOARD = False

        exePath = os.path.abspath(exePath)
        cwd = os.path.abspath(os.path.join(exePath, os.pardir))  # 获取exe父文件夹
        cmds = [exePath]
        # 处理启动参数
        if modelsPath is not None:
            if os.path.exists(modelsPath) and os.path.isdir(modelsPath):
                cmds += ["--models_path", os.path.abspath(modelsPath)]
            else:
                raise Exception(
                    f"Input modelsPath doesn't exits or isn't a directory. modelsPath: [{modelsPath}]"
                )
        if isinstance(argument, dict):
            for key, value in argument.items():
                # Popen() 要求输入list里所有的元素都是 str 或 bytes
                if isinstance(value, bool):
                    cmds += [f"--{key}={value}"]  # 布尔参数必须键和值连在一起
                elif isinstance(value, str):
                    cmds += [f"--{key}", value]
                else:
                    cmds += [f"--{key}", str(value)]
        # 设置子进程启用静默模式,不显示控制台窗口
        self.ret = None
        startupinfo = None
        if "win32" in str(sysPlatform).lower():
            startupinfo = subprocess.STARTUPINFO()
            startupinfo.dwFlags = (
                subprocess.CREATE_NEW_CONSOLE | subprocess.STARTF_USESHOWWINDOW
            )
            startupinfo.wShowWindow = subprocess.SW_HIDE
        self.ret = subprocess.Popen(  # 打开管道
            cmds,
            cwd=cwd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,  # 丢弃stderr的内容
            startupinfo=startupinfo,  # 开启静默模式
        )
        # 启动子进程
        while True:
            if not self.ret.poll() == None:  # 子进程已退出,初始化失败
                raise Exception(f"OCR init fail.")
            initStr = self.ret.stdout.readline().decode("utf-8", errors="ignore")
            if "OCR init completed." in initStr:  # 初始化成功
                break
            elif "OCR clipboard enbaled." in initStr:  # 检测到剪贴板已启用
                self.__ENABLE_CLIPBOARD = True
        atexit.register(self.exit)  # 注册程序终止时执行强制停止子进程

    def isClipboardEnabled(self) -> bool:
        return self.__ENABLE_CLIPBOARD

    def getRunningMode(self) -> str:
        # 默认管道模式只能运行在本地
        return "local"

    def runDict(self, writeDict: dict):
        """传入指令字典,发送给引擎进程。\n
        `writeDict`: 指令字典。\n
        `return`: {"code": 识别码, "data": 内容列表或错误信息字符串}\n"""
        # 检查子进程
        if not self.ret:
            return {"code": 901, "data": f"引擎实例不存在。"}
        if not self.ret.poll() == None:
            return {"code": 902, "data": f"子进程已崩溃。"}
        # 输入信息
        writeStr = jsonDumps(writeDict, ensure_ascii=True, indent=None) + "\n"
        try:
            self.ret.stdin.write(writeStr.encode("utf-8"))
            self.ret.stdin.flush()
        except Exception as e:
            return {
                "code": 902,
                "data": f"向识别器进程传入指令失败,疑似子进程已崩溃。{e}",
            }
        # 获取返回值
        try:
            getStr = self.ret.stdout.readline().decode("utf-8", errors="ignore")
        except Exception as e:
            return {"code": 903, "data": f"读取识别器进程输出值失败。异常信息:[{e}]"}
        try:
            return jsonLoads(getStr)
        except Exception as e:
            return {
                "code": 904,
                "data": f"识别器输出值反序列化JSON失败。异常信息:[{e}]。原始内容:[{getStr}]",
            }

    def run(self, imgPath: str):
        """对一张本地图片进行文字识别。\n
        `imgPath`: 图片路径。\n
        `return`: {"code": 识别码, "data": 内容列表或错误信息字符串}\n"""
        writeDict = {"image_path": imgPath}
        return self.runDict(writeDict)

    def runClipboard(self):
        """立刻对剪贴板第一位的图片进行文字识别。\n
        `return`: {"code": 识别码, "data": 内容列表或错误信息字符串}\n"""
        if self.__ENABLE_CLIPBOARD:
            return self.run("clipboard")
        else:
            raise Exception("剪贴板功能不存在或已禁用。")

    def runBase64(self, imageBase64: str):
        """对一张编码为base64字符串的图片进行文字识别。\n
        `imageBase64`: 图片base64字符串。\n
        `return`: {"code": 识别码, "data": 内容列表或错误信息字符串}\n"""
        writeDict = {"image_base64": imageBase64}
        return self.runDict(writeDict)

    def runBytes(self, imageBytes):
        """对一张图片的字节流信息进行文字识别。\n
        `imageBytes`: 图片字节流。\n
        `return`: {"code": 识别码, "data": 内容列表或错误信息字符串}\n"""
        imageBase64 = b64encode(imageBytes).decode("utf-8")
        return self.runBase64(imageBase64)

    def exit(self):
        """关闭引擎子进程"""
        if hasattr(self, "ret"):
            if not self.ret:
                return
            try:
                self.ret.kill()  # 关闭子进程
            except Exception as e:
                print(f"[Error] ret.kill() {e}")
            self.ret = None
        atexit.unregister(self.exit)  # 移除退出处理
        print("### PPOCR引擎子进程关闭!")
    @staticmethod
    def printResult(res: dict):
        """用于调试,格式化打印识别结果。\n
        `res`: OCR识别结果。"""

        # 识别成功
        if res["code"] == 100:
            index = 1
            for line in res["data"]:
                print(
                    f"{index}-置信度:{round(line['score'], 2)},文本:{line['text']}",
                    end="\\n\n" if line.get("end", "") == "\n" else "\n",
                )
                index += 1
        elif res["code"] == 101:  # 101:图片中未识别出任何文字
            print("图片中未识别出文字。")
        else:
            print(f"图片识别失败。错误码:{res['code']},错误信息:{res['data']}")

    def __del__(self):
        self.exit()

class PPOCR_socket(PPOCR_pipe):
    """调用OCR(套接字模式)"""

    def __init__(self, exePath: str, modelsPath: str = None, argument: dict = None):
        """初始化识别器(套接字模式)。\n
        `exePath`: 识别器`PaddleOCR_json.exe`的路径。\n
        `modelsPath`: 识别库`models`文件夹的路径。若为None则默认识别库与识别器在同一目录下。\n
        `argument`: 启动参数,字典`{"键":值}`。参数说明见 https://github.com/hiroi-sora/PaddleOCR-json
        """
        # 处理参数
        if not argument:
            argument = {}
        if "port" not in argument:
            argument["port"] = 0  # 随机端口号
        if "addr" not in argument:
            argument["addr"] = "loopback"  # 本地环回地址

        # 处理输入的路径,可能为本地或远程路径
        self.__runningMode = self.__configureExePath(exePath)

        # 如果为本地路径:使用 PPOCR_pipe 来开启本地引擎进程
        if self.__runningMode == "local":
            super().__init__(self.exePath, modelsPath, argument)  # 父类构造函数
            self.__ENABLE_CLIPBOARD = super().isClipboardEnabled()
            # 再获取一行输出,检查是否成功启动服务器
            initStr = self.ret.stdout.readline().decode("utf-8", errors="ignore")
            if not self.ret.poll() == None:  # 子进程已退出,初始化失败
                raise Exception(f"Socket init fail.")
            if "Socket init completed. " in initStr:  # 初始化成功
                splits = initStr.split(":")
                self.ip = splits[0].split("Socket init completed. ")[1]
                self.port = int(splits[1])  # 提取端口号
                self.ret.stdout.close()  # 关闭管道重定向,防止缓冲区填满导致堵塞
                print(f"套接字服务器初始化成功。{self.ip}:{self.port}")
                return

        # 如果为远程路径:直接连接
        elif self.__runningMode == "remote":
            self.__ENABLE_CLIPBOARD = False
            # 发送一个空指令,检测远程服务器可用性
            testServer = self.runDict({})
            if testServer["code"] in [902, 903, 904]:
                raise Exception(f"Socket connection fail.")
            print(f"套接字服务器连接成功。{self.ip}:{self.port}")
            return

        # 异常
        self.exit()
        raise Exception(f"Socket init fail.")

    def isClipboardEnabled(self) -> bool:
        return self.__ENABLE_CLIPBOARD

    def getRunningMode(self) -> str:
        return self.__runningMode

    def runDict(self, writeDict: dict):
        """传入指令字典,发送给引擎进程。\n
        `writeDict`: 指令字典。\n
        `return`: {"code": 识别码, "data": 内容列表或错误信息字符串}\n"""

        # 仅在本地模式下检查引擎进程
        if self.__runningMode == "local":
            # 检查子进程
            if not self.ret.poll() == None:
                return {"code": 901, "data": f"子进程已崩溃。"}

        # 通信
        writeStr = jsonDumps(writeDict, ensure_ascii=True, indent=None) + "\n"
        try:
            # 创建TCP连接
            clientSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            clientSocket.connect((self.ip, self.port))
            # 发送数据
            clientSocket.sendall(writeStr.encode())
            # 发送完所有数据,关闭我方套接字,之后只能从服务器读取数据
            clientSocket.shutdown(socket.SHUT_WR)
            # 接收数据
            resData = b""
            while True:
                chunk = clientSocket.recv(1024)
                if not chunk:
                    break
                resData += chunk
            getStr = resData.decode()
        except ConnectionRefusedError:
            return {"code": 902, "data": "连接被拒绝"}
        except TimeoutError:
            return {"code": 903, "data": "连接超时"}
        except Exception as e:
            return {"code": 904, "data": f"网络错误:{e}"}
        finally:
            clientSocket.close()  # 关闭连接
        # 反序列输出信息
        try:
            return jsonLoads(getStr)
        except Exception as e:
            return {
                "code": 905,
                "data": f"识别器输出值反序列化JSON失败。异常信息:[{e}]。原始内容:[{getStr}]",
            }

    def exit(self):
        """关闭引擎子进程"""
        # 仅在本地模式下关闭引擎进程
        if hasattr(self, "ret"):
            if self.__runningMode == "local":
                if not self.ret:
                    return
                try:
                    self.ret.kill()  # 关闭子进程
                except Exception as e:
                    print(f"[Error] ret.kill() {e}")
                self.ret = None

        self.ip = None
        self.port = None
        atexit.unregister(self.exit)  # 移除退出处理
        print("### PPOCR引擎子进程关闭!")

    def __del__(self):
        self.exit()

    def __configureExePath(self, exePath: str) -> str:
        """处理识别器路径,自动区分本地路径和远程路径"""

        pattern = r"remote://(.*):(\d+)"
        match = re.search(pattern, exePath)
        try:
            if match:  # 远程模式
                self.ip = match.group(1)
                self.port = int(match.group(2))
                if self.ip == "any":
                    self.ip = "0.0.0.0"
                elif self.ip == "loopback":
                    self.ip = "127.0.0.1"
                return "remote"
            else:  # 本地模式
                self.exePath = exePath
                return "local"
        except:
            return None


def GetOcrApi(
    exePath: str, modelsPath: str = None, argument: dict = None, ipcMode: str = "pipe"
):
    """获取识别器API对象。\n
    `exePath`: 识别器`PaddleOCR_json.exe`的路径。\n
    `modelsPath`: 识别库`models`文件夹的路径。若为None则默认识别库与识别器在同一目录下。\n
    `argument`: 启动参数,字典`{"键":值}`。参数说明见 https://github.com/hiroi-sora/PaddleOCR-json\n
    `ipcMode`: 进程通信模式,可选值为套接字模式`socket` 或 管道模式`pipe`。用法上完全一致。
    """
    if ipcMode == "socket":
        return PPOCR_socket(exePath, modelsPath, argument)
    elif ipcMode == "pipe":
        return PPOCR_pipe(exePath, modelsPath, argument)
    else:
        raise Exception(
            f'ipcMode可选值为 套接字模式"socket" 或 管道模式"pipe" ,不允许{ipcMode}。'
        )
flask_app/PaddleOCR/python_api/PPOCR_visualize.py (new file, 159 lines)
@@ -0,0 +1,159 @@
# 将 PaddleOCR-json 结果可视化表现
# 项目主页:
# https://github.com/hiroi-sora/PaddleOCR-json
from PIL import Image, ImageDraw, ImageFont
import math


class visualize:
    """可视化"""

    # ================================ 静态方法 ================================

    @staticmethod
    def createBox(textBlocks, size, fill="#00500040", outline="#11ff22", width=6):
        """创建包围盒图层,返回PIL Image对象。\n
        :textBlocks: 文本块列表。\n
        :size: 图片尺寸。\n
        以下为可选字段:(颜色为十六进制6位RGB或8位RGBA字符串,如 #112233ff)\n
        :fill: 包围盒填充颜色。\n
        :outline: 包围盒轮廓颜色。\n
        :width: 包围盒轮廓粗细,像素。
        """
        img = Image.new("RGBA", size, 0)
        draw = ImageDraw.Draw(img)
        for tb in textBlocks:
            box = [
                tuple(tb["box"][0]),
                tuple(tb["box"][1]),
                tuple(tb["box"][2]),
                tuple(tb["box"][3]),
            ]
            draw.polygon(box, fill=fill, outline=outline, width=width)
        return img

    @staticmethod
    def createText(
        textBlocks,
        size,
        ttfPath="C:\Windows\Fonts\msyh.ttc",
        ttfScale=0.9,
        fill="#ff0000",
    ):
        """创建文字图层,返回PIL Image对象。\n
        :textBlocks: 文本块列表。\n
        :size: 图片尺寸。\n
        以下为可选字段:\n
        :ttfPath: 字体文件路径。默认为微软雅黑,若不存在此字体会报错。\n
        :ttfScale: 字体大小整体缩放系数,应在1附近。\n
        :fill: 文字颜色,十六进制6位RGB或8位RGBA字符串,如 #112233ff。\n
        """
        img = Image.new("RGBA", size, 0)
        draw = ImageDraw.Draw(img)
        ttfDict = {}  # 缓存不同大小的字体对象
        for tb in textBlocks:
            text = tb["text"]
            xy = tuple(tb["box"][0])  # 左上角坐标
            xy1 = tb["box"][3]  # 左下角坐标
            # 行高
            hight = round(
                math.sqrt(((xy[0] - xy1[0]) ** 2) + ((xy[1] - xy1[1]) ** 2)) * ttfScale
            )
            if hight not in ttfDict:
                ttfDict[hight] = ImageFont.truetype(ttfPath, hight)  # 创建新大小的字体
            draw.text(xy, text, font=ttfDict[hight], fill=fill)
        return img

    @staticmethod
    def createOrder(
        textBlocks,
        size,
        ttfPath="C:\Windows\Fonts\msyh.ttc",
        ttfSize=50,
        fill="#2233ff",
        bg="#ffffffe0",
    ):
        """创建序号图层,返回PIL Image对象。\n
        :textBlocks: 文本块列表。\n
        :size: 图片尺寸。\n
        以下为可选字段:\n
        :ttfPath: 字体文件路径。默认为微软雅黑,若不存在此字体会报错。\n
        :ttfSize: 字体大小。\n
        :fill: 文字颜色,十六进制6位RGB或8位RGBA字符串,如 #112233ff。\n
        """
        img = Image.new("RGBA", size, 0)
        draw = ImageDraw.Draw(img)
        ttf = ImageFont.truetype(ttfPath, ttfSize)  # 字体
        for index, tb in enumerate(textBlocks):
            text = f"{index+1}"
            xy = tuple(tb["box"][0])  # 左上角坐标
            x_, y_, w, h = ttf.getbbox(text)  # 获取宽高。只需要w和h
            w *= 1.1
            h *= 1.1
            draw.rectangle((xy, (xy[0] + w, xy[1] + h)), fill=bg, width=0)  # 背景矩形
            draw.text(xy, text, font=ttf, fill=fill)  # 文字
        return img

    @staticmethod
    def createContrast(img1, img2):
        """左右拼合两个图片,创建对比图层,返回PIL Image对象。"""
        size = (img1.size[0] + img2.size[0], max(img1.size[1], img2.size[1]))
        img = Image.new("RGBA", size, 0)
        img.paste(img1, (0, 0))
        img.paste(img2, (img1.size[0], 0))
        return img

    @staticmethod
    def composite(img1, img2):
        """传入两个PIL Image对象(RGBA格式),以img1为底,将img2叠加在其上,
        返回生成的图片"""
        return Image.alpha_composite(img1, img2)

    # ================================ 快捷接口 ================================

    def __init__(self, textBlocks, imagePath):
        """创建可视化对象。\n
        :textBlocks: 文本块列表,即OCR返回的data部分\n
        :imagePath: 对应的图片路径。
        """
        self.imgSource = Image.open(imagePath).convert("RGBA")  # 原始图片图层
        self.size = self.imgSource.size
        self.imgBox = self.createBox(textBlocks, self.size)  # 包围盒图层
        self.imgText = self.createText(textBlocks, self.size)  # 文字图层
        self.imgOrder = self.createOrder(textBlocks, self.size)  # 序号图层

    def get(self, isBox=True, isText=False, isOrder=False, isSource=True):
        """返回合成可视化结果的PIL Image图像。\n
        :isBox: T时返回包围盒图层。\n
        :isText: T时返回文字图层。\n
        :isOrder: T时返回序号图层。\n
        :isSource: T时返回原图。F时返回透明背景的纯可视化结果。\n
        """
        img = Image.new("RGBA", self.size, 0)
        flags = (isSource, isBox, isText, isOrder)
        for index, im in enumerate(
            [self.imgSource, self.imgBox, self.imgText, self.imgOrder]
        ):
            if im and flags[index]:
                img = visualize.composite(img, im)
        return img

    def show(self, isBox=True, isText=False, isOrder=False, isSource=True):
        """显示可视化结果图像。\n
        :isBox: T时返回包围盒图层。\n
        :isText: T时返回文字图层。\n
        :isOrder: T时返回序号图层。\n
        :isSource: T时返回原图。F时返回透明背景的纯可视化结果。\n
        """
        img = self.get(isBox, isText, isOrder, isSource)
        img.show()

    def save(self, path="", isBox=True, isText=False, isOrder=False, isSource=True):
        """保存可视化结果图像。\n
        :path: 保存路径。\n
        :isBox: T时返回包围盒图层。\n
        :isText: T时返回文字图层。\n
        :isOrder: T时返回序号图层。\n
        :isSource: T时返回原图。F时返回透明背景的纯可视化结果。\n
        """
        img = self.get(isBox, isText, isOrder, isSource)
        img.save(path)
flask_app/PaddleOCR/python_api/README.md (new file, 417 lines)
@@ -0,0 +1,417 @@
# PaddleOCR-json Python API

使用这份API,可以方便地调用 PaddleOCR-json 。比起Python原生的PaddleOCR库,PaddleOCR-json拥有更好的性能。你可以同时享受C++推理库的高效率和Python的简易开发。

请先在本项目 [Releases](https://github.com/hiroi-sora/PaddleOCR-json/releases) 中下载OCR引擎二进制程序,然后将 [python api](https://github.com/hiroi-sora/PaddleOCR-json/tree/main/api/python)(当前目录中的所有文件)下载到本地,即可通过python接口调用二进制程序。

Python API 拥有三大模块:
- 基础OCR接口
- 结果可视化模块,将OCR结果绘制到图像上并展示或保存。
- 文本后处理模块,支持段落合并、竖排文本整理等功能。

# 基础OCR接口

```python
from PPOCR_api import GetOcrApi
```

### 调用OCR的流程分为三步:
1. 初始化OCR引擎进程
2. 通过OCR引擎,执行一次或多次识图任务
3. 关闭OCR引擎进程

### 第一步:初始化

**接口:** `GetOcrApi()`

**参数:**

| 名称 | 默认值 | 类型 | 描述 |
| ---------- | ------ | ---- | -------------------------------------------------------------- |
| exePath | 必填 | str | 引擎二进制文件的路径,或远程服务器地址,见下。 |
| modelsPath | None | str | 识别库路径,若为None则默认识别库与引擎在同一目录下。 |
| argument | None | dict | 启动参数字典。可以用这个参数指定配置文件、指定识别语言。 |
| ipcMode | "pipe" | str | 进程间通信方式,可选值为套接字模式`socket` 或 管道模式`pipe`。 |

##### 关于 `exePath` :

当前允许两种调用引擎的模式:
1. 引擎部署在本地:
   - 在 [Releases](https://github.com/hiroi-sora/PaddleOCR-json/releases) 中下载OCR引擎二进制程序到本地,解压。
   - Windows 平台:`exePath` 传入 `PaddleOCR-json.exe` 的路径。
   - Linux 平台:`exePath` 传入 `run.sh` 的路径。
2. 引擎部署在远程:
   - 在服务器上部署 PaddleOCR-json 程序,启用服务器模式,并确保客户机可以访问服务器。
   - 客户机:`exePath` 传入 `"remote://ip:port"` 。

##### 关于 `modelsPath` :

这个参数的本意是希望能自动处理相对路径在不同的工作路径下出错的问题。API在启动引擎进程时会将工作路径设置在引擎的父文件夹下,如果用户直接传入 `models_path` 路径到参数字典 `argument` 则很容易出现路径错误。而 `modelsPath` 参数则会先将输入的路径以当前的python运行路径为基准转换成绝对路径,之后再用 `models_path` 参数的形式输入给引擎,进而防止路径错误。当然,你也可以通过输入一个新的 `models_path` 参数到 `argument` 字典来覆盖掉这个路径。

[更多有关 `models_path` 参数的细节请看这里](../../README.md#常用配置参数说明)。
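
下面给出一个传入 `modelsPath` 的最小示意(其中的引擎路径和识别库路径均为假设示例,请替换为实际路径):

```python
# 最小示意:显式指定识别库文件夹(路径为假设示例)
# modelsPath 会先被转换成绝对路径,再以 --models_path 的形式传给引擎
from PPOCR_api import GetOcrApi

ocr = GetOcrApi(
    exePath=r"D:\PaddleOCR-json\PaddleOCR-json.exe",  # 假设的引擎路径
    modelsPath=r"D:\PaddleOCR-json\models",           # 假设的识别库路径
)
```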

**返回值:**

初始化成功,返回引擎API对象。初始化失败或连接远程服务失败,抛出异常。

**示例1:** 最简单的情况

```python
ocr = GetOcrApi(r"…………\PaddleOCR_json.exe")
```

**示例2:** 指定使用繁体中文识别库(需要先在引擎models目录内放入识别库文件)

注意,config_path的路径如果是相对路径,则根为PaddleOCR-json.exe所在的路径,而不是Python脚本的路径。

```python
argument = {'config_path': "models/config_chinese_cht.txt"}
ocr = GetOcrApi(r"…………\PaddleOCR_json.exe", argument=argument)
```

**示例3:** 指定使用套接字通信方式

使用管道通信(默认)和套接字通信,在使用上而言是透明的,即调用方法完全一致。

性能上有微弱的区别,管道的效率略高一点,而套接字TCP在大型数据传输时(如30MB以上的Base64图片数据)可能稳定性略好一些。对于普通用户,使用默认设定即可。

```python
ocr = GetOcrApi(r"…………\PaddleOCR_json.exe", ipcMode="socket")
```

**示例4:** 使用套接字模式连接到远程服务器

在套接字通信模式下,你可以连接到一个远程的PaddleOCR-json服务器。这样一来就不需要将整套系统部署到同一台机器上了。

```python
ip = '192.168.10.1'
port = 1234
ocr = GetOcrApi(r"remote://192.168.10.1:1234", ipcMode="socket")
```

这里我们使用一个URI来代替引擎位置,表示服务器的IP和端口。接着用参数 `ipcMode` 来使用套接字模式(不可以用管道模式)。在这种情况下,输入 `argument` 参数不会有任何作用,因为这个python脚本并不会启动引擎进程。

在这种部署情况下,我们建议你使用方法 `runBase64()` 或者 `runBytes()` 来传输文件,方法 `run()` 的路径传输方式很容易出错。当然,你也可以禁用服务器的[路径传输json命令image_path](../../cpp/README.md#cmake构建参数)。

### 第二步:识别图片

Python API 提供了丰富的接口,可以用多种方式调用OCR。

#### 1. 识别本地图片

**方法:** `run()`

**说明:** 对一张本地图片进行OCR

**参数:**

| 名称 | 默认值 | 类型 | 描述 |
| ------- | ------ | ---- | ------------------------------------ |
| imgPath | 必填 | str | 识别图片的路径,如`D:/test/test.png` |

**返回值字典:**

| 键 | 类型 | 描述 |
| ---- | ---- | ------------------------------------------------------- |
| code | int | 状态码。识别成功且有文字为100。其他情况详见主页README。 |
| data | list | 识别成功时,data为OCR结果列表。 |
| data | str | 识别失败时,data为错误信息字符串。 |

**示例:**

```python
res = ocr.run("test.png")
print("识别结果:\n", res)
```

#### 2. 识别图片字节流

**方法:** `runBytes()`

**说明:** 对一个图片字节流进行OCR。可以通过这个接口识别 PIL Image 或者屏幕截图或者网络下载的图片,全程走内存,而无需先保存到硬盘。

**参数:**

| 名称 | 默认值 | 类型 | 描述 |
| ---------- | ------ | ----- | ---------- |
| imageBytes | 必填 | bytes | 字节流对象 |

**返回值字典:同上**

**示例:**

```python
with open("test.png", 'rb') as f:  # 获取图片字节流
    imageBytes = f.read()  # 实际使用中,可以联网下载或者截图获取字节流
res = ocr.runBytes(imageBytes)
print("字节流识别结果:\n", res)
```

#### 3. 识别图片Base64编码字符串

**方法:** `runBase64()`

**说明:** 对一个Base64编码字符串进行OCR。

**参数:**

| 名称 | 默认值 | 类型 | 描述 |
| ----------- | ------ | ---- | ------------------ |
| imageBase64 | 必填 | str | Base64编码的字符串 |

**返回值字典:同上**
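
本节正文没有附示例,这里补充一个最小示意(`test.png` 为假设的文件名),展示如何自行完成 Base64 编码后再调用 `runBase64()`:

```python
# 最小示意:读取本地图片,编码为 Base64 字符串后识别(test.png 为假设文件名)
from base64 import b64encode

with open("test.png", "rb") as f:
    imageBase64 = b64encode(f.read()).decode("utf-8")
res = ocr.runBase64(imageBase64)
ocr.printResult(res)
```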

#### 4. 格式化输出OCR结果

**方法:** `printResult()`

**说明:** 用于调试,打印一个OCR结果。

**参数:**

| 名称 | 默认值 | 类型 | 描述 |
| ---- | ------ | ---- | ----------------- |
| res | 必填 | dict | 一次OCR的返回结果 |

**无返回值**

**示例:**

```python
res = ocr.run("test.png")
print("格式化输出:")
ocr.printResult(res)
```

<details>
<summary>
<strong>剪贴板相关接口已弃用,不建议使用</strong>
</summary>

#### 5. 识别剪贴板图片

**方法:** `runClipboard()`

**说明:** 对当前剪贴板首位的图片进行OCR

**无参数**

**返回值字典:同上**

**示例:**

```python
res = ocr.runClipboard()
print("剪贴板识别结果:\n", res)
```

</details>

**方法:** `isClipboardEnabled()`

**说明:** 检测剪贴板功能是否启用。

**无参数**

**返回值:**

如果剪贴板已启用:`True`

如果剪贴板未启用:`False`

**方法:** `getRunningMode()`

**说明:** 检测PaddleOCR-json引擎的运行模式,本地或远程

**无参数**

**返回值字符串:**

如果引擎运行在本地:`"local"`

如果引擎运行在远程:`"remote"`
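
下面是一个简单示意,按运行模式打印不同提示(`ocr` 为前文初始化得到的对象,与 demo1.py 中的用法一致):

```python
# 示意:根据运行模式与剪贴板状态输出提示
if ocr.getRunningMode() == "local":
    print("引擎运行在本地。")
elif ocr.getRunningMode() == "remote":
    print("引擎运行在远程服务器。")
print("剪贴板功能:", "已启用" if ocr.isClipboardEnabled() else "未启用")
```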

使用示例详见 [demo1.py](demo1.py)

### 第三步:关闭OCR引擎进程

一般情况下,在程序结束或者释放ocr对象时会自动关闭引擎子进程,无需手动管理。

如果希望手动关闭引擎进程,可以使用 `exit()` 方法。

**示例:**

```python
ocr.exit()
```

如果需要更换识别语言,则重新创建ocr对象即可,旧的对象析构时也会自动关闭旧引擎进程。

**示例:**

```python
argument = {'config_path': "语言1.txt"}
ocr = GetOcrApi(r"…………\PaddleOCR_json.exe", argument=argument)
# TODO: 识别语言1

argument = {'config_path': "语言2.txt"}
ocr = GetOcrApi(r"…………\PaddleOCR_json.exe", argument=argument)
# TODO: 识别语言2
```

# 结果可视化模块

纯Python实现,不依赖PPOCR引擎的C++ opencv可视化模块,避免中文兼容性问题。

需要PIL图像处理库:`pip install pillow`

```python
from PPOCR_visualize import visualize
```

### 获取文本块

首先得成功执行一次OCR,获取文本块列表(即`['data']`部分)
```python
testImg = "D:/test.png"
getObj = ocr.run(testImg)
if not getObj["code"] == 100:
    print('识别失败!!')
    exit()
textBlocks = getObj["data"]  # 提取文本块数据
```

### 展示结果图片

只需一行代码,传入文本块和原图片的路径,打开图片浏览窗口
```python
visualize(textBlocks, testImg).show()
```
此时程序阻塞,直到关闭图片浏览窗口才继续往下走。

### 图片保存到本地
```python
visualize(textBlocks, testImg).save('可视化结果.png')
```

### 获取PIL Image对象
```python
vis = visualize(textBlocks, testImg)
img = vis.get()
```

### 调整显示图层

以上`show`,`save`,`get`三个接口,均能开启或禁用指定图层(示意见下):

- `isBox` T时启用包围盒图层。
- `isText` T时启用文字图层。
- `isOrder` T时启用序号图层。
- `isSource` T时启用原图。F禁用原图,即得到透明背景的纯可视化结果。
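
例如,下面的示意只保留序号图层并关闭原图(`vis` 为前文创建的 visualize 对象,输出文件名为假设示例):

```python
# 示意:只输出序号图层,背景透明
imgOrder = vis.get(isBox=False, isOrder=True, isSource=False)
imgOrder.save("order_only.png")  # 文件名为假设示例
```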

### 左右对比

传入两个PIL Image对象,返回它们左右拼接而成的新Image
```python
img_12 = visualize.createContrast(img1, img2)
```

### 调整显示效果(颜色、粗细、字体等)

导入PIL库,以便操作图片对象
```python
from PIL import Image
```

接口创建各个图层,传入文本块、要生成的图层大小、自定义参数,然后将各个图层合并。

颜色有关的参数,均可传入6位RGB十六进制码(如`#112233`)或8位RGBA码(最后两位控制透明度,如`#11223344`)
```python
# 创建各图层
img = Image.open(testImg).convert('RGBA')  # 原始图片背景图层
imgBox = visualize.createBox(textBlocks, img.size,  # 包围盒图层
                             outline='#ccaa99aa', width=10)
imgText = visualize.createText(textBlocks, img.size,  # 文本图层
                               fill='#44556699')
# 合并各图层
img = visualize.composite(img, imgBox)
img = visualize.composite(img, imgText)
img.show()  # 显示
```

使用示例详见 [demo2.py](demo2.py)

# 文本后处理 tbpu

(text block processing unit)

```python
from tbpu import GetParser
```

由 [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) 和 [间隙树排序法](https://github.com/hiroi-sora/GapTree_Sort_Algorithm) 带来的技术。

OCR返回的结果中,一项包含文字、包围盒、置信度的元素,称为一个“文本块” - text block 。

文本块不一定是完整的一句话或一个段落,反之,一般是零散的文字。一个OCR结果常由多个文本块组成,这些文本块原始的顺序也不一定符合阅读顺序。

文本块后处理 tbpu 的作用就是:将OCR原始文本块进行处理,调整其顺序、并划分出段落。

### 方案列表

| 方案id | 方案名称 |
| ------------- | ------------- |
| `multi_para` | 多栏-自然段 |
| `multi_line` | 多栏-总是换行 |
| `multi_none` | 多栏-无换行 |
| `single_para` | 单栏-自然段 |
| `single_line` | 单栏-总是换行 |
| `single_none` | 单栏-无换行 |
| `single_code` | 单栏-代码段 |

也可以在 [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) 中直观地体验这些方案的作用。

通过 `GetParser("方案id")` 来获取对应方案的后处理解析器对象。通过`run()`接口调用解析,并传入OCR结果列表,得到处理后的新列表,见下。

### 使用

向接口传入文本块列表(即`['data']`部分),返回新的文本块列表。
```python
from tbpu import GetParser

textBlocks = getObj["data"]

# 获取“多栏-自然段”排版解析器对象
parser = GetParser("multi_para")
# 传入OCR结果列表,返回新的文本块列表
textBlocksNew = parser.run(textBlocks)
```

- 执行后,原列表 textBlocks 的结构可能被破坏,不要再使用原列表(或先深拷贝备份)。
- 新文本块列表 textBlocksNew 中,每个文本块的顺序会根据所选方案重新排序。
- 同时,textBlocksNew每个文本块中会增加键值 `["end"]` ,表示这个文本块的结尾符(即与下一个文本块的间隔符号)是什么。以 `multi_para` 为例(拼接用法示意见下):
  - 假如一个文本块位于一个自然段的段尾,则 `["end"]=="\n"` 。
  - 假如位于自然段的中间,且上下文为中文,则 `["end"]==""` 。
  - 假如位于自然段的中间,且上下文为英文,则 `["end"]==" "` 。
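
借助 `["end"]` 字段,可以把整理后的文本块重新拼接成完整文本,下面是一个示意(`textBlocksNew` 为上文 `parser.run()` 的返回值):

```python
# 示意:利用 end 字段把文本块拼接成完整文本
fullText = ""
for tb in textBlocksNew:
    fullText += tb["text"] + tb.get("end", "\n")
print(fullText)
```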

跟结果可视化配合使用:
```python
from tbpu import GetParser

# OCR原始结果 可视化
textBlocks = getObj["data"]
img1 = visualize(textBlocks, testImg).get(isOrder=True)

# 执行文本块后处理:多栏-自然段
parser = GetParser("multi_para")
textBlocksNew = parser.run(textBlocks)

# 后处理结果 可视化
img2 = visualize(textBlocksNew, testImg).get(isOrder=True)

# 左右拼接图片并展示
visualize.createContrast(img1, img2).show()
```

使用示例详见 [demo3.py](demo3.py)
65
flask_app/PaddleOCR/python_api/demo1.py
Normal file
65
flask_app/PaddleOCR/python_api/demo1.py
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
# 👉 demo1.py :演示OCR基础功能
|
||||||
|
# demo2.py :演示可视化接口
|
||||||
|
# demo3.py :演示OCR文段后处理(段落合并)接口
|
||||||
|
|
||||||
|
from PPOCR_api import GetOcrApi
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
# 测试图片路径
|
||||||
|
TestImagePath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test.jpg")
|
||||||
|
|
||||||
|
# 初始化识别器对象,传入 PaddleOCR-json 引擎路径。
|
||||||
|
# 引擎下载地址: https://github.com/hiroi-sora/PaddleOCR-json/releases
|
||||||
|
# Windows: 传入 PaddleOCR-json.exe 的路径。
|
||||||
|
# Linux: 传入 run.sh 的路径
|
||||||
|
ocr = GetOcrApi(r"Your Path/PaddleOCR-json.exe")
|
||||||
|
|
||||||
|
if ocr.getRunningMode() == "local":
|
||||||
|
print(f"初始化OCR成功,进程号为{ocr.ret.pid}")
|
||||||
|
elif ocr.getRunningMode() == "remote":
|
||||||
|
print(f"连接远程OCR引擎成功,ip:{ocr.ip},port:{ocr.port}")
|
||||||
|
print(f"\n测试图片路径:{TestImagePath}")
|
||||||
|
|
||||||
|
# 示例1:识别本地图片
|
||||||
|
res = ocr.run(TestImagePath)
|
||||||
|
print(f"\n示例1-图片路径识别结果(原始信息):\n{res}")
|
||||||
|
print(f"\n示例1-图片路径识别结果(格式化输出):")
|
||||||
|
ocr.printResult(res)
|
||||||
|
|
||||||
|
# 示例2:识别图片字节流
|
||||||
|
with open(TestImagePath, "rb") as f: # 获取图片字节流
|
||||||
|
# 实际使用中,可以联网下载或者截图获取字节流,直接送入OCR,无需保存到本地中转。
|
||||||
|
imageBytes = f.read()
|
||||||
|
res = ocr.runBytes(imageBytes)
|
||||||
|
print(f"\n示例2-字节流识别结果:")
|
||||||
|
ocr.printResult(res)
|
||||||
|
|
||||||
|
# 示例3:识别 PIL Image 对象
|
||||||
|
try:
|
||||||
|
from PIL import Image
|
||||||
|
from io import BytesIO
|
||||||
|
except Exception:
|
||||||
|
print("安装Pillow库后方可测试示例3。")
|
||||||
|
Image = None
|
||||||
|
if Image:
|
||||||
|
# 创建一个PIL Image对象
|
||||||
|
pilImage = Image.open(TestImagePath)
|
||||||
|
# Image 对象转为 字节流
|
||||||
|
buffered = BytesIO()
|
||||||
|
pilImage.save(buffered, format="PNG")
|
||||||
|
imageBytes = buffered.getvalue()
|
||||||
|
# 送入OCR
|
||||||
|
res = ocr.runBytes(imageBytes)
|
||||||
|
print(f"\n示例3-PIL Image 识别结果:")
|
||||||
|
ocr.printResult(res)
|
||||||
|
|
||||||
|
# 以下示例默认禁用
|
||||||
|
# 示例4:识别剪贴板图片
|
||||||
|
if ocr.isClipboardEnabled():
|
||||||
|
res = ocr.runClipboard()
|
||||||
|
if res["code"] == 212:
|
||||||
|
print(f"\n示例4-当前剪贴板中没有图片。")
|
||||||
|
else:
|
||||||
|
print(f"\n示例4-剪贴板识别结果:")
|
||||||
|
ocr.printResult(res)
|
55
flask_app/PaddleOCR/python_api/demo2.py
Normal file
55
flask_app/PaddleOCR/python_api/demo2.py
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
# demo1.py :演示OCR基础功能
|
||||||
|
# 👉 demo2.py :演示可视化接口
|
||||||
|
# demo3.py :演示OCR文段后处理(段落合并)接口
|
||||||
|
|
||||||
|
from PPOCR_api import GetOcrApi
|
||||||
|
from PPOCR_visualize import visualize
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
# 测试图片路径
|
||||||
|
TestImagePath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test.jpg")
|
||||||
|
|
||||||
|
# 初始化识别器对象,传入 PaddleOCR-json 引擎路径。
|
||||||
|
ocr = GetOcrApi(r"Your Path/PaddleOCR-json.exe")
|
||||||
|
|
||||||
|
if ocr.getRunningMode() == "local":
|
||||||
|
print(f"初始化OCR成功,进程号为{ocr.ret.pid}")
|
||||||
|
elif ocr.getRunningMode() == "remote":
|
||||||
|
print(f"连接远程OCR引擎成功,ip:{ocr.ip},port:{ocr.port}")
|
||||||
|
print(f"\n测试图片路径:{TestImagePath}")
|
||||||
|
|
||||||
|
|
||||||
|
# OCR识别图片,获取文本块
|
||||||
|
getObj = ocr.run(TestImagePath)
|
||||||
|
ocr.exit() # 结束引擎子进程
|
||||||
|
if not getObj["code"] == 100:
|
||||||
|
print("识别失败!!")
|
||||||
|
exit()
|
||||||
|
textBlocks = getObj["data"] # 提取文本块数据
|
||||||
|
|
||||||
|
# 可视化演示
|
||||||
|
|
||||||
|
# 示例1:传入文本块和图片路径,显示结果
|
||||||
|
print("显示图片!")
|
||||||
|
visualize(textBlocks, TestImagePath).show()
|
||||||
|
# 程序阻塞,直到关闭图片浏览窗口才继续往下走。如果长时间不动,注释掉上面这行再跑
|
||||||
|
|
||||||
|
# 示例2:显示更详细的信息
|
||||||
|
vis = visualize(textBlocks, TestImagePath)
|
||||||
|
print("获取图片!")
|
||||||
|
# 禁用包围盒,获取原图片的 PIL Image 对象
|
||||||
|
visImg1 = vis.get(isBox=False)
|
||||||
|
# 启用文本和序号、禁用原图(显示透明背景),获取 PIL Image 对象
|
||||||
|
visImg2 = vis.get(isText=True, isOrder=True, isSource=False)
|
||||||
|
# 获取两个图片的左右对比,左边是原图,右边是单独的文本框
|
||||||
|
vis = visualize.createContrast(visImg1, visImg2)
|
||||||
|
# 显示该对比
|
||||||
|
vis.show()
|
||||||
|
# 接下来可以还用PIL库对visImg进一步处理。
|
||||||
|
|
||||||
|
# 保存到本地
|
||||||
|
print(f"保存图片到 {os.path.dirname(os.path.abspath(__file__))}\\可视化结果.png ")
|
||||||
|
vis.save(f"{os.path.dirname(os.path.abspath(__file__))}\\可视化结果.png", isText=True)
|
||||||
|
|
||||||
|
print("程序结束。")
|
49
flask_app/PaddleOCR/python_api/demo3.py
Normal file
49
flask_app/PaddleOCR/python_api/demo3.py
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
# demo1.py :演示OCR基础功能
|
||||||
|
# demo2.py :演示可视化接口
|
||||||
|
# 👉 demo3.py :演示OCR文段后处理(段落合并)接口
|
||||||
|
|
||||||
|
from PPOCR_api import GetOcrApi
|
||||||
|
from PPOCR_visualize import visualize # 可视化
|
||||||
|
from tbpu import GetParser # 获取排版解析器的接口
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
# 测试图片路径
|
||||||
|
TestImagePath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test.jpg")
|
||||||
|
|
||||||
|
# 初始化识别器对象,传入 PaddleOCR-json 引擎路径。
|
||||||
|
ocr = GetOcrApi(r"Your Path/PaddleOCR-json.exe")
|
||||||
|
|
||||||
|
if ocr.getRunningMode() == "local":
|
||||||
|
print(f"初始化OCR成功,进程号为{ocr.ret.pid}")
|
||||||
|
elif ocr.getRunningMode() == "remote":
|
||||||
|
print(f"连接远程OCR引擎成功,ip:{ocr.ip},port:{ocr.port}")
|
||||||
|
print(f"\n测试图片路径:{TestImagePath}")
|
||||||
|
|
||||||
|
# OCR识别图片,获取文本块
|
||||||
|
getObj = ocr.run(TestImagePath)
|
||||||
|
ocr.exit() # 结束引擎子进程
|
||||||
|
if not getObj["code"] == 100:
|
||||||
|
print("识别失败!!")
|
||||||
|
exit()
|
||||||
|
textBlocks = getObj["data"] # 提取文本块数据
|
||||||
|
|
||||||
|
# OCR原始结果的可视化Image
|
||||||
|
img1 = visualize(textBlocks, TestImagePath).get(isOrder=True)
|
||||||
|
ocr.exit() # 结束引擎子进程
|
||||||
|
print("========== 原始结果 ==========")
|
||||||
|
ocr.printResult(getObj)
|
||||||
|
|
||||||
|
# 获取排版解析器对象
|
||||||
|
parser = GetParser("multi_para")
|
||||||
|
# 传入OCR结果列表,返回新的文本块列表
|
||||||
|
textBlocksNew = parser.run(textBlocks)
|
||||||
|
# 注意,处理后原列表 textBlocks 的结构可能被破坏,不要再使用原列表(或先深拷贝备份)。
|
||||||
|
print("========== 整理后结果 ==========")
|
||||||
|
getObj["data"] = textBlocksNew
|
||||||
|
ocr.printResult(getObj)
|
||||||
|
|
||||||
|
# 可视化 后处理结果的可视化Image
|
||||||
|
img2 = visualize(textBlocksNew, TestImagePath).get(isOrder=True)
|
||||||
|
print("显示可视化结果。左边是原始结果,右边是合并自然段后的结果。")
|
||||||
|
visualize.createContrast(img1, img2).show() # 左右拼接图片并展示
|
29
flask_app/PaddleOCR/python_api/tbpu/__init__.py
Normal file
29
flask_app/PaddleOCR/python_api/tbpu/__init__.py
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
# tbpu : text block processing unit 文本块后处理
|
||||||
|
|
||||||
|
from .tbpu import Tbpu
|
||||||
|
from .parser_none import ParserNone
|
||||||
|
from .parser_multi_para import MultiPara
|
||||||
|
from .parser_multi_line import MultiLine
|
||||||
|
from .parser_multi_none import MultiNone
|
||||||
|
from .parser_single_para import SinglePara
|
||||||
|
from .parser_single_line import SingleLine
|
||||||
|
from .parser_single_none import SingleNone
|
||||||
|
from .parser_single_code import SingleCode
|
||||||
|
|
||||||
|
# 排版解析
|
||||||
|
Parser = {
|
||||||
|
"none": ParserNone, # 不做处理
|
||||||
|
"multi_para": MultiPara, # 多栏-自然段
|
||||||
|
"multi_line": MultiLine, # 多栏-总是换行
|
||||||
|
"multi_none": MultiNone, # 多栏-无换行
|
||||||
|
"single_para": SinglePara, # 单栏-自然段
|
||||||
|
"single_line": SingleLine, # 单栏-总是换行
|
||||||
|
"single_none": SingleNone, # 单栏-无换行
|
||||||
|
"single_code": SingleCode, # 单栏-代码段
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# 获取排版解析器对象
|
||||||
|
def GetParser(key) -> Tbpu:
|
||||||
|
if key in Parser:
|
||||||
|
return Parser[key]()
|
22
flask_app/PaddleOCR/python_api/tbpu/parser_multi_line.py
Normal file
22
flask_app/PaddleOCR/python_api/tbpu/parser_multi_line.py
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
# 排版解析-多栏-单行
|
||||||
|
|
||||||
|
from .tbpu import Tbpu
|
||||||
|
from .parser_tools.line_preprocessing import linePreprocessing # 行预处理
|
||||||
|
from .parser_tools.gap_tree import GapTree # 间隙树排序算法
|
||||||
|
|
||||||
|
|
||||||
|
class MultiLine(Tbpu):
|
||||||
|
def __init__(self):
|
||||||
|
self.tbpuName = "排版解析-多栏-单行"
|
||||||
|
|
||||||
|
# 构建算法对象,指定包围盒的元素位置
|
||||||
|
self.gtree = GapTree(lambda tb: tb["normalized_bbox"])
|
||||||
|
|
||||||
|
def run(self, textBlocks):
|
||||||
|
textBlocks = linePreprocessing(textBlocks) # 预处理
|
||||||
|
textBlocks = self.gtree.sort(textBlocks) # 构建间隙树
|
||||||
|
# 补充行尾间隔符
|
||||||
|
for tb in textBlocks:
|
||||||
|
tb["end"] = "\n"
|
||||||
|
del tb["normalized_bbox"]
|
||||||
|
return textBlocks
|
29
flask_app/PaddleOCR/python_api/tbpu/parser_multi_none.py
Normal file
29
flask_app/PaddleOCR/python_api/tbpu/parser_multi_none.py
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
# 排版解析-多栏-无换行
|
||||||
|
|
||||||
|
from .tbpu import Tbpu
|
||||||
|
from .parser_tools.line_preprocessing import linePreprocessing # 行预处理
|
||||||
|
from .parser_tools.gap_tree import GapTree # 间隙树排序算法
|
||||||
|
from .parser_tools.paragraph_parse import word_separator # 上下句间隔符
|
||||||
|
|
||||||
|
|
||||||
|
class MultiNone(Tbpu):
|
||||||
|
def __init__(self):
|
||||||
|
self.tbpuName = "排版解析-多栏-无换行"
|
||||||
|
|
||||||
|
# 构建算法对象,指定包围盒的元素位置
|
||||||
|
self.gtree = GapTree(lambda tb: tb["normalized_bbox"])
|
||||||
|
|
||||||
|
def run(self, textBlocks):
|
||||||
|
textBlocks = linePreprocessing(textBlocks) # 预处理
|
||||||
|
textBlocks = self.gtree.sort(textBlocks) # 构建间隙树
|
||||||
|
# 补充行尾间隔符
|
||||||
|
for i in range(len(textBlocks)):
|
||||||
|
tb = textBlocks[i]
|
||||||
|
if i < len(textBlocks) - 1:
|
||||||
|
letter1 = tb["text"][-1] # 行1结尾字母
|
||||||
|
letter2 = textBlocks[i + 1]["text"][0] # 行2开头字母
|
||||||
|
tb["end"] = word_separator(letter1, letter2) # 获取间隔符
|
||||||
|
else:
|
||||||
|
tb["end"] = "\n"
|
||||||
|
del tb["normalized_bbox"]
|
||||||
|
return textBlocks
|
33
flask_app/PaddleOCR/python_api/tbpu/parser_multi_para.py
Normal file
33
flask_app/PaddleOCR/python_api/tbpu/parser_multi_para.py
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
# 排版解析-多栏-自然段
|
||||||
|
|
||||||
|
from .tbpu import Tbpu
|
||||||
|
from .parser_tools.line_preprocessing import linePreprocessing # 行预处理
|
||||||
|
from .parser_tools.gap_tree import GapTree # 间隙树排序算法
|
||||||
|
from .parser_tools.paragraph_parse import ParagraphParse # 段内分析器
|
||||||
|
|
||||||
|
|
||||||
|
class MultiPara(Tbpu):
|
||||||
|
def __init__(self):
|
||||||
|
self.tbpuName = "排版解析-多栏-自然段"
|
||||||
|
|
||||||
|
# 间隙树对象
|
||||||
|
self.gtree = GapTree(lambda tb: tb["normalized_bbox"])
|
||||||
|
|
||||||
|
# 段内分析器对象
|
||||||
|
get_info = lambda tb: (tb["normalized_bbox"], tb["text"])
|
||||||
|
|
||||||
|
def set_end(tb, end): # 获取预测的块尾分隔符
|
||||||
|
tb["end"] = end
|
||||||
|
|
||||||
|
self.pp = ParagraphParse(get_info, set_end)
|
||||||
|
|
||||||
|
def run(self, textBlocks):
|
||||||
|
textBlocks = linePreprocessing(textBlocks) # 预处理
|
||||||
|
textBlocks = self.gtree.sort(textBlocks) # 构建间隙树
|
||||||
|
nodes = self.gtree.get_nodes_text_blocks() # 获取树节点序列
|
||||||
|
# 对每个结点,进行自然段分析
|
||||||
|
for tbs in nodes:
|
||||||
|
self.pp.run(tbs) # 预测结尾分隔符
|
||||||
|
for tb in tbs:
|
||||||
|
del tb["normalized_bbox"]
|
||||||
|
return textBlocks
|
14
flask_app/PaddleOCR/python_api/tbpu/parser_none.py
Normal file
14
flask_app/PaddleOCR/python_api/tbpu/parser_none.py
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
# 排版解析-不做处理
|
||||||
|
|
||||||
|
from .tbpu import Tbpu
|
||||||
|
|
||||||
|
|
||||||
|
class ParserNone(Tbpu):
|
||||||
|
def __init__(self):
|
||||||
|
self.tbpuName = "排版解析-不做处理"
|
||||||
|
|
||||||
|
def run(self, textBlocks):
|
||||||
|
for tb in textBlocks:
|
||||||
|
if "end" not in tb:
|
||||||
|
tb["end"] = "\n" # 默认结尾间隔符为换行
|
||||||
|
return textBlocks
|
74
flask_app/PaddleOCR/python_api/tbpu/parser_single_code.py
Normal file
74
flask_app/PaddleOCR/python_api/tbpu/parser_single_code.py
Normal file
@ -0,0 +1,74 @@
|
|||||||
|
# 排版解析-单栏-代码段
|
||||||
|
|
||||||
|
from .parser_single_line import SingleLine
|
||||||
|
from .parser_tools.line_preprocessing import linePreprocessing # 行预处理
|
||||||
|
|
||||||
|
from bisect import bisect_left
|
||||||
|
|
||||||
|
|
||||||
|
class SingleCode(SingleLine):
|
||||||
|
def __init__(self):
|
||||||
|
self.tbpuName = "排版解析-单栏-代码段"
|
||||||
|
|
||||||
|
def merge_line(self, line): # 合并一行
|
||||||
|
A = line[0]
|
||||||
|
ba = A["box"]
|
||||||
|
ha = ba[3][1] - ba[0][1] # 块A行高
|
||||||
|
score = A["score"]
|
||||||
|
for i in range(1, len(line)):
|
||||||
|
B = line[i]
|
||||||
|
bb = B["box"]
|
||||||
|
ha = (ha + bb[3][1] - bb[0][1]) / 2
|
||||||
|
# 合并文字,补充与间距相同的空格数
|
||||||
|
space = 0
|
||||||
|
if bb[0][0] > ba[1][0]:
|
||||||
|
space = round((bb[0][0] - ba[1][0]) / ha)
|
||||||
|
A["text"] += " " * space + B["text"]
|
||||||
|
print(space, bb[0][0], ba[1][0])
|
||||||
|
# 合并包围盒
|
||||||
|
yTop = min(ba[0][1], ba[1][1], bb[0][1], bb[1][1])
|
||||||
|
yBottom = max(ba[2][1], ba[3][1], bb[2][1], bb[3][1])
|
||||||
|
xLeft = min(ba[0][0], ba[3][0], bb[0][0], bb[3][0])
|
||||||
|
xRight = max(ba[1][0], ba[2][0], bb[1][0], bb[2][0])
|
||||||
|
ba[0][1] = ba[1][1] = yTop # y上
|
||||||
|
ba[2][1] = ba[3][1] = yBottom # y下
|
||||||
|
ba[0][0] = ba[3][0] = xLeft # x左
|
||||||
|
ba[1][0] = ba[2][0] = xRight # x右
|
||||||
|
# 置信度
|
||||||
|
score += B["score"]
|
||||||
|
A["score"] = score / len(line)
|
||||||
|
del A["normalized_bbox"]
|
||||||
|
A["end"] = "\n"
|
||||||
|
return A
|
||||||
|
|
||||||
|
def indent(self, tbs): # 分析所有行,构造缩进
|
||||||
|
lh = 0 # 平均行高
|
||||||
|
xMin = float("inf") # 句首的最左、最右x值
|
||||||
|
xMax = float("-inf")
|
||||||
|
for tb in tbs:
|
||||||
|
b = tb["box"]
|
||||||
|
lh += b[3][1] - b[0][1]
|
||||||
|
x = b[0][0]
|
||||||
|
xMin = min(xMin, x)
|
||||||
|
xMax = max(xMax, x)
|
||||||
|
lh /= len(tbs)
|
||||||
|
lh2 = lh / 2
|
||||||
|
# 构建缩进层级列表
|
||||||
|
levelList = []
|
||||||
|
x = xMin
|
||||||
|
while x < xMax:
|
||||||
|
levelList.append(x)
|
||||||
|
x += lh
|
||||||
|
# 按照层级,为每行句首加上空格,并调整包围盒
|
||||||
|
for tb in tbs:
|
||||||
|
b = tb["box"]
|
||||||
|
level = bisect_left(levelList, b[0][0] + lh2) - 1 # 二分查找层级点
|
||||||
|
tb["text"] = " " * level + tb["text"] # 补充空格
|
||||||
|
b[0][0] = b[3][0] = xMin # 左侧归零
|
||||||
|
|
||||||
|
def run(self, textBlocks):
|
||||||
|
textBlocks = linePreprocessing(textBlocks) # 预处理
|
||||||
|
lines = self.get_lines(textBlocks) # 获取每一行
|
||||||
|
tbs = [self.merge_line(line) for line in lines] # 合并所有行
|
||||||
|
self.indent(tbs) # 为每行添加句首缩进
|
||||||
|
return tbs
|
73
flask_app/PaddleOCR/python_api/tbpu/parser_single_line.py
Normal file
73
flask_app/PaddleOCR/python_api/tbpu/parser_single_line.py
Normal file
@ -0,0 +1,73 @@
# Layout parsing - single column - one line per block
from .tbpu import Tbpu
from .parser_tools.line_preprocessing import linePreprocessing  # line preprocessing
from .parser_tools.paragraph_parse import word_separator  # separator between two sentences


class SingleLine(Tbpu):
    def __init__(self):
        self.tbpuName = "排版解析-单栏-单行"

    # find all lines in the text block list
    def get_lines(self, textBlocks):
        # sort by x
        textBlocks.sort(key=lambda tb: tb["normalized_bbox"][0])
        lines = []
        for i1, tb1 in enumerate(textBlocks):
            if not tb1:
                continue
            # the leftmost block of the line
            l1, top1, r1, bottom1 = tb1["normalized_bbox"]
            h1 = bottom1 - top1
            now_line = [tb1]
            # check which blocks to the right belong to the same line
            for i2 in range(i1 + 1, len(textBlocks)):
                tb2 = textBlocks[i2]
                if not tb2:
                    continue
                l2, top2, r2, bottom2 = tb2["normalized_bbox"]
                h2 = bottom2 - top2
                # block 2 starts too far to the left
                if l2 < r1 - h1:
                    continue
                # too far apart vertically
                if top2 < top1 - h1 * 0.5 or bottom2 > bottom1 + h1 * 0.5:
                    continue
                # line heights differ too much
                if abs(h1 - h2) > min(h1, h2) * 0.5:
                    continue
                # the block qualifies
                now_line.append(tb2)
                textBlocks[i2] = None
                # update the search window
                r1 = r2
            # one line finished
            for i2 in range(len(now_line) - 1):
                # check the horizontal gap between adjacent blocks of the same line
                l1, t1, r1, b1 = now_line[i2]["normalized_bbox"]
                l2, t2, r2, b2 = now_line[i2 + 1]["normalized_bbox"]
                h = (b1 + b2 - t1 - t2) * 0.5
                if l2 - r1 > h * 1.5:  # gap too wide, force a space
                    now_line[i2]["end"] = " "
                    continue
                letter1 = now_line[i2]["text"][-1]
                letter2 = now_line[i2 + 1]["text"][0]
                now_line[i2]["end"] = word_separator(letter1, letter2)
            now_line[-1]["end"] = "\n"
            lines.append(now_line)
            textBlocks[i1] = None
        # sort all lines by y
        lines.sort(key=lambda tbs: tbs[0]["normalized_bbox"][1])
        return lines

    def run(self, textBlocks):
        textBlocks = linePreprocessing(textBlocks)  # preprocessing
        lines = self.get_lines(textBlocks)  # collect the lines
        # unpack
        textBlocks = []
        for line in lines:
            for tb in line:
                del tb["normalized_bbox"]
                textBlocks.append(tb)
        return textBlocks
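A minimal usage sketch of the class above (assuming the flask_app package is importable from the project root; the boxes and texts are made up): two blocks sharing a baseline are grouped into one line and receive 'end' separators.

from flask_app.PaddleOCR.python_api.tbpu.parser_single_line import SingleLine

blocks = [
    {"box": [[10, 10], [110, 10], [110, 30], [10, 30]], "score": 0.9, "text": "Hello"},
    {"box": [[120, 10], [220, 10], [220, 30], [120, 30]], "score": 0.9, "text": "world"},
]
out = SingleLine().run(blocks)
print("".join(tb["text"] + tb["end"] for tb in out))  # "Hello world\n"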
19
flask_app/PaddleOCR/python_api/tbpu/parser_single_none.py
Normal file
@ -0,0 +1,19 @@
# Layout parsing - single column - no line breaks
from .parser_single_line import SingleLine
from .parser_tools.paragraph_parse import word_separator  # separator between two sentences


class SingleNone(SingleLine):
    def __init__(self):
        self.tbpuName = "排版解析-单栏-无换行"

    def run(self, textBlocks):
        textBlocks = super().run(textBlocks)
        # replace every line break with a plain separator
        for i in range(len(textBlocks) - 1):
            if textBlocks[i]["end"] == "\n":
                letter1 = textBlocks[i]["text"][-1]
                letter2 = textBlocks[i + 1]["text"][0]
                textBlocks[i]["end"] = word_separator(letter1, letter2)
        return textBlocks
49
flask_app/PaddleOCR/python_api/tbpu/parser_single_para.py
Normal file
@ -0,0 +1,49 @@
# Layout parsing - single column - natural paragraphs
from .parser_single_line import SingleLine
from .parser_tools.line_preprocessing import linePreprocessing  # line preprocessing
from .parser_tools.paragraph_parse import ParagraphParse  # in-paragraph analyser


class SinglePara(SingleLine):
    def __init__(self):
        self.tbpuName = "排版解析-单栏-自然段"

        # callbacks for the paragraph analyser
        get_info = lambda tb: (tb["normalized_bbox"], tb["text"])

        def set_end(tb, end):  # store the predicted trailing separator of a block
            tb["line"][-1]["end"] = end

        self.pp = ParagraphParse(get_info, set_end)

    def run(self, textBlocks):
        textBlocks = linePreprocessing(textBlocks)  # preprocessing
        lines = self.get_lines(textBlocks)  # collect the lines
        # wrap every line into a pseudo text block
        temp_tbs = []
        for line in lines:
            b0, b1, b2, b3 = line[0]["normalized_bbox"]
            # grow the bbox over the whole line
            for i in range(1, len(line)):
                bb = line[i]["normalized_bbox"]
                b1 = min(b1, bb[1])
                b2 = max(b2, bb[2])
                b3 = max(b3, bb[3])
            # build the pseudo block
            temp_tbs.append(
                {
                    "normalized_bbox": (b0, b1, b2, b3),
                    "text": line[0]["text"][0] + line[-1]["text"][-1],
                    "line": line,
                }
            )
        # predict the trailing separators
        self.pp.run(temp_tbs)
        # unpack
        textBlocks = []
        for t in temp_tbs:
            for tb in t["line"]:
                del tb["normalized_bbox"]
                textBlocks.append(tb)
        return textBlocks
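A similar sketch for SinglePara (made-up coordinates, same import assumption): two stacked lines with matching left and right edges are treated as one paragraph, so the first line ends with a space instead of a newline.

from flask_app.PaddleOCR.python_api.tbpu.parser_single_para import SinglePara

blocks = [
    {"box": [[10, 10], [210, 10], [210, 30], [10, 30]], "score": 0.9, "text": "The quick brown"},
    {"box": [[10, 35], [210, 35], [210, 55], [10, 55]], "score": 0.9, "text": "fox jumps."},
]
out = SinglePara().run(blocks)
print("".join(tb["text"] + tb["end"] for tb in out))  # "The quick brown fox jumps.\n"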
330
flask_app/PaddleOCR/python_api/tbpu/parser_tools/gap_tree.py
Normal file
@ -0,0 +1,330 @@
|
|||||||
|
# 【间隙·树·排序算法】 GapTree_Sort_Algorithm
|
||||||
|
# 对OCR结果或PDF提取的文本进行版面分析,按人类阅读顺序进行排序。
|
||||||
|
# Author: hiroi-sora
|
||||||
|
# https://github.com/hiroi-sora/GapTree_Sort_Algorithm
|
||||||
|
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
|
|
||||||
|
class GapTree:
|
||||||
|
def __init__(self, get_bbox: Callable):
|
||||||
|
"""
|
||||||
|
:param get_bbox: 函数,传入单个文本块,
|
||||||
|
返回该文本块左上角、右下角的坐标元组 (x0, y0, x1, y1)
|
||||||
|
"""
|
||||||
|
self.get_bbox = get_bbox
|
||||||
|
|
||||||
|
# ======================= 调用接口 =====================
|
||||||
|
# 对文本块列表排序
|
||||||
|
def sort(self, text_blocks: list):
|
||||||
|
"""
|
||||||
|
对文本块列表,按人类阅读顺序进行排序。
|
||||||
|
|
||||||
|
:param text_blocks: 文本块对象列表
|
||||||
|
:return: 排序后的文本块列表
|
||||||
|
"""
|
||||||
|
|
||||||
|
# 封装块单元,并求页面左右边缘
|
||||||
|
units, page_l, page_r = self._get_units(text_blocks, self.get_bbox)
|
||||||
|
# 求行和竖切线
|
||||||
|
cuts, rows = self._get_cuts_rows(units, page_l, page_r)
|
||||||
|
# 求布局树
|
||||||
|
root = self._get_layout_tree(cuts, rows)
|
||||||
|
# 求树节点序列
|
||||||
|
nodes = self._preorder_traversal(root)
|
||||||
|
# 求排序后的 原始文本块序列
|
||||||
|
new_text_blocks = self._get_text_blocks(nodes)
|
||||||
|
|
||||||
|
# 测试:缓存中间变量,以便调试输出
|
||||||
|
self.current_rows = rows
|
||||||
|
self.current_cuts = cuts
|
||||||
|
self.current_nodes = nodes
|
||||||
|
|
||||||
|
return new_text_blocks
|
||||||
|
|
||||||
|
# 获取以区块为单位的文本块二层列表
|
||||||
|
def get_nodes_text_blocks(self):
|
||||||
|
"""
|
||||||
|
获取以区块为单位的文本块二层列表。需要在 sort 后调用。
|
||||||
|
|
||||||
|
:return: [ [区块1的text_blocks], [区块2的text_blocks]... ]
|
||||||
|
"""
|
||||||
|
result = []
|
||||||
|
for node in self.current_nodes:
|
||||||
|
tbs = []
|
||||||
|
if node["units"]:
|
||||||
|
for unit in node["units"]:
|
||||||
|
tbs.append(unit[1])
|
||||||
|
result.append(tbs)
|
||||||
|
return result
|
||||||
|
|
||||||
|
# ======================= 封装块单元列表 =====================
|
||||||
|
# 将原始文本块,封装为 ( (x0,y0,x2,y2), 原始 ) 。并检查页边界。
|
||||||
|
def _get_units(self, text_blocks, get_bbox):
|
||||||
|
# 封装单元列表 units [ ( (x0,y0,x2,y2), 原始文本块 ), ... ]
|
||||||
|
units = []
|
||||||
|
page_l, page_r = float("inf"), -1 # 记录文本块的左右最值,作为页边界
|
||||||
|
for tb in text_blocks:
|
||||||
|
x0, y0, x2, y2 = get_bbox(tb)
|
||||||
|
units.append(((x0, y0, x2, y2), tb))
|
||||||
|
if x0 < page_l:
|
||||||
|
page_l = x0
|
||||||
|
if x2 > page_r:
|
||||||
|
page_r = x2
|
||||||
|
units.sort(key=lambda a: a[0][1]) # 按顶部从上到下排序
|
||||||
|
return units, page_l, page_r
|
||||||
|
|
||||||
|
# ======================= 求行和竖切线 =====================
|
||||||
|
"""
|
||||||
|
扫描所有文本块,获取所有行和竖切线。
|
||||||
|
一个行,由一组垂直位置接近的文本块所组成。
|
||||||
|
一条竖切线,由多个连续行中,同一位置的间隙所组成。间隙划分同一行中不同列的文本块。
|
||||||
|
输入:一个页面上的文本块单元列表 units=[ ( (x0,y0,x2,y2), _ ) ] 。必须按上到下排序。
|
||||||
|
返回:
|
||||||
|
竖切线列表 cuts=[ ( 左边缘x, 右边缘x, 起始行号, 结束行号 ) ] 。从左到右排序
|
||||||
|
页面上的行 rows=[ [unit...] ] 。从上到下,从左到右排序
|
||||||
|
"""
|
||||||
|
|
||||||
|
def _get_cuts_rows(self, units, page_l, page_r):
|
||||||
|
# 使用间隙组 gaps2 更新 gaps1 。返回: 更新后的gaps1 , gaps1中被移除的间隙
|
||||||
|
def update_gaps(gaps1, gaps2):
|
||||||
|
flags1 = [True for _ in gaps1] # gaps1[i] 是否彻底移除
|
||||||
|
flags2 = [True for _ in gaps2] # gaps2[i] 是否新加入
|
||||||
|
new_gaps1 = []
|
||||||
|
for i1, g1 in enumerate(gaps1):
|
||||||
|
l1, r1, _ = g1
|
||||||
|
for i2, g2 in enumerate(gaps2): # 对每一个gap1,考察所有gap2
|
||||||
|
l2, r2, _ = g2
|
||||||
|
# 计算交集的起点和终点
|
||||||
|
inter_l = max(l1, l2)
|
||||||
|
inter_r = min(r1, r2)
|
||||||
|
# 如果交集有效
|
||||||
|
if inter_l <= inter_r:
|
||||||
|
# 更新 gap1 左右边缘
|
||||||
|
new_gaps1.append((inter_l, inter_r, g1[2]))
|
||||||
|
flags1[i1] = False # 旧的 gap1 不应移除
|
||||||
|
flags2[i2] = False # 新的 gap2 不应添加
|
||||||
|
# gap2 新加入
|
||||||
|
for i2, f2 in enumerate(flags2):
|
||||||
|
if f2:
|
||||||
|
new_gaps1.append(gaps2[i2])
|
||||||
|
# 记录 gaps1 彻底移除的项
|
||||||
|
del_gaps1 = []
|
||||||
|
for i1, f1 in enumerate(flags1):
|
||||||
|
if f1:
|
||||||
|
del_gaps1.append(gaps1[i1])
|
||||||
|
|
||||||
|
return new_gaps1, del_gaps1
|
||||||
|
|
||||||
|
# ========================================
|
||||||
|
|
||||||
|
page_l -= 1 # 保证页面左右边缘不与文本块重叠
|
||||||
|
page_r += 1
|
||||||
|
# 存放所有行。“row”指同一水平线上的单元块(可能属于多列)。 [ [unit...] ]
|
||||||
|
rows = []
|
||||||
|
# 已生成完毕的竖切线。[ ( 左边缘x, 右边缘x , 起始行号, 结束行号 ) ]
|
||||||
|
completed_cuts = []
|
||||||
|
# 考察中的间隙。 [ (左边缘x, 右边缘x , 开始行号) ]
|
||||||
|
gaps = []
|
||||||
|
row_index = 0 # 当前行号
|
||||||
|
unit_index = 0 # 当前块号
|
||||||
|
# 从上到下遍历所有文本行
|
||||||
|
l_units = len(units)
|
||||||
|
while unit_index < l_units:
|
||||||
|
# ========== 查找当前行 row ==========
|
||||||
|
unit = units[unit_index] # 当前行最顶部的块
|
||||||
|
u_bottom = unit[0][3]
|
||||||
|
row = [unit] # 当前行
|
||||||
|
# 查找当前行的剩余块
|
||||||
|
for i in range(unit_index + 1, len(units)):
|
||||||
|
next_u = units[i]
|
||||||
|
next_top = next_u[0][1]
|
||||||
|
if next_top > u_bottom:
|
||||||
|
break # 下一块的顶部超过当前底部,结束本行
|
||||||
|
row.append(next_u) # 当前行添加块
|
||||||
|
unit_index = i # 步进 已遍历的块序号
|
||||||
|
# ========== 查找当前行的间隙 row_gaps ==========
|
||||||
|
row.sort(key=lambda x: (x[0][0], x[0][2])) # 当前行中的块 从左到右排序
|
||||||
|
row_gaps = [] # 当前行的间隙 [ ( ( 左边缘l, 右边缘r ), 开始行号) ]
|
||||||
|
search_start = page_l # 本轮搜索的线段起始点为页面左边缘
|
||||||
|
for u in row: # 遍历当前行的块
|
||||||
|
l = u[0][0] # 块左侧
|
||||||
|
r = u[0][2] # 块右侧
|
||||||
|
# 若块起始点大于搜索起始点,那么将这部分加入到结果
|
||||||
|
if l > search_start:
|
||||||
|
row_gaps.append((search_start, l, row_index))
|
||||||
|
# 若块结束点大于搜索起始点,更新搜索起始点
|
||||||
|
if r > search_start:
|
||||||
|
search_start = r
|
||||||
|
# 页面右边缘 加入最后一个间隙
|
||||||
|
row_gaps.append((search_start, page_r, row_index))
|
||||||
|
# ========== 更新考察中的间隙组 ==========
|
||||||
|
gaps, del_gaps = update_gaps(gaps, row_gaps)
|
||||||
|
# gaps 中被移除的项,加入生成完毕的竖切线 completed_cuts
|
||||||
|
row_max = row_index - 1 # 竖切线结束行号
|
||||||
|
for dg1 in del_gaps:
|
||||||
|
completed_cuts.append((*dg1, row_max))
|
||||||
|
# ========== End ==========
|
||||||
|
rows.append(row) # 总行列表添加当前行
|
||||||
|
unit_index += 1
|
||||||
|
row_index += 1
|
||||||
|
# 遍历结束,收集 gaps 中剩余的间隙,组成延伸到最后一行的竖切线
|
||||||
|
row_max = len(rows) - 1 # 竖切线结束行号
|
||||||
|
for g in gaps:
|
||||||
|
completed_cuts.append((*g, row_max))
|
||||||
|
completed_cuts.sort(key=lambda c: c[0])
|
||||||
|
return completed_cuts, rows
|
||||||
|
|
||||||
|
# ======================= 求布局树 =====================
|
||||||
|
"""
|
||||||
|
一个布局树节点表示一个区块。定义:
|
||||||
|
node = {
|
||||||
|
"x_left": 节点左边缘x,
|
||||||
|
"x_right": 右边缘x,
|
||||||
|
"r_top": 顶部的行号,
|
||||||
|
"r_bottom": 底部的行号,
|
||||||
|
"units": [], # 节点内部的文本块列表(除了根节点为空,其它节点非空)
|
||||||
|
"children": [], # 子节点,有序
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
|
def _get_layout_tree(self, cuts, rows):
|
||||||
|
# 竖切线,将一个横行切开,断开的区域为“间隙”。
|
||||||
|
# 生成每一行对应的间隙 (左侧,右侧) 坐标列表
|
||||||
|
rows_gaps = [[] for _ in rows]
|
||||||
|
for g_i, cut in enumerate(cuts):
|
||||||
|
for r_i in range(cut[2], cut[3] + 1):
|
||||||
|
rows_gaps[r_i].append((cut[0], cut[1]))
|
||||||
|
|
||||||
|
root = { # 根节点
|
||||||
|
"x_left": cuts[0][0] - 1,
|
||||||
|
"x_right": cuts[-1][1] + 1,
|
||||||
|
"r_top": -1,
|
||||||
|
"r_bottom": -1,
|
||||||
|
"units": [],
|
||||||
|
"children": [],
|
||||||
|
}
|
||||||
|
completed_nodes = [root] # 已经完成结束的节点
|
||||||
|
now_nodes = [] # 当前正在考虑的节点。无顺序
|
||||||
|
|
||||||
|
# ========== 结束一个节点,加入节点树 ==========
|
||||||
|
def complete(node):
|
||||||
|
node_r = node["x_right"] - 2 # 当前节点右边界
|
||||||
|
max_nodes = [] # 符合父节点条件的,最低的完成节点列表
|
||||||
|
max_r = -2 # 符合父节点条件的最低行数
|
||||||
|
# 在完成列表中,寻找父节点
|
||||||
|
for com_node in completed_nodes:
|
||||||
|
# 父节点的垂直投影必须包含当前右界
|
||||||
|
if node_r < com_node["x_left"] or node_r > com_node["x_right"] + 0.0001:
|
||||||
|
continue
|
||||||
|
# 父节点底部必须在当前之上
|
||||||
|
if com_node["r_bottom"] >= node["r_top"]:
|
||||||
|
continue
|
||||||
|
# 遇到更低的符合条件节点
|
||||||
|
if com_node["r_bottom"] > max_r:
|
||||||
|
max_r = com_node["r_bottom"]
|
||||||
|
max_nodes = [com_node]
|
||||||
|
continue
|
||||||
|
# 遇到同样低的符合条件节点
|
||||||
|
if com_node["r_bottom"] == max_r:
|
||||||
|
max_nodes.append(com_node)
|
||||||
|
continue
|
||||||
|
# 在最低列表中,寻找最右的节点作为父节点
|
||||||
|
max_node = max(max_nodes, key=lambda n: n["x_right"])
|
||||||
|
max_node["children"].append(node) # 加入父节点
|
||||||
|
completed_nodes.append(node) # 加入完成列表
|
||||||
|
|
||||||
|
# ========== 遍历每行,更新节点树 ==========
|
||||||
|
for r_i, row in enumerate(rows):
|
||||||
|
row_gaps = rows_gaps[r_i] # 当前行的间隙组
|
||||||
|
u_i = g_i = 0 # 当前考察的 文本块、间隙下标
|
||||||
|
|
||||||
|
# ========== 检查是否有正在考虑的节点 可以结束 ==========
|
||||||
|
new_nodes = []
|
||||||
|
for node in now_nodes: # 遍历节点
|
||||||
|
l_flag = r_flag = False # 标记节点左右边缘是否延续
|
||||||
|
completed_flag = False # 标记节点是否可以结束
|
||||||
|
x_left = node["x_left"] # 左右边缘坐标
|
||||||
|
x_right = node["x_right"]
|
||||||
|
for gap in row_gaps: # 遍历该行所有间隙
|
||||||
|
if gap[1] == x_left: # 节点左边缘被间隙右侧延续
|
||||||
|
l_flag = True
|
||||||
|
if gap[0] == x_right: # 右边缘被间隙左侧延续
|
||||||
|
r_flag = True
|
||||||
|
# 任意间隙在本节点下方,打断本节点
|
||||||
|
if x_left < gap[0] < x_right or x_left < gap[1] < x_right:
|
||||||
|
completed_flag = True
|
||||||
|
break
|
||||||
|
if not l_flag or not r_flag: # 左右任意一个边缘无法延续
|
||||||
|
completed_flag = True
|
||||||
|
if completed_flag: # 节点结束,加入节点树
|
||||||
|
complete(node)
|
||||||
|
else: # 节点继续
|
||||||
|
node["r_bottom"] = r_i
|
||||||
|
new_nodes.append(node)
|
||||||
|
now_nodes = new_nodes
|
||||||
|
|
||||||
|
# ========== 从左到右遍历,将文本块加入对应列的节点 ==========
|
||||||
|
while u_i < len(row):
|
||||||
|
unit = row[u_i] # 当前块
|
||||||
|
# ========== 当前块 unit 位于间隙 g_i 与 g_i+1 之间的区间 ==========
|
||||||
|
x_l = row_gaps[g_i][1] # 左间隙 g_i 的右边界
|
||||||
|
x_r = row_gaps[g_i + 1][0] # 右间隙 g_i+1 的左边界
|
||||||
|
# 检查区间是否正确
|
||||||
|
if unit[0][0] + 0.0001 > x_r: # 块比右间隙更右,说明到了下一个区间
|
||||||
|
g_i += 1 # 间隙步进,块不步进
|
||||||
|
continue
|
||||||
|
# ========== 检查当前块可否加入已有的节点 ==========
|
||||||
|
flag = False
|
||||||
|
for node in now_nodes:
|
||||||
|
# 若某个节点的左右侧坐标,与当前块一致,则当前块加入节点
|
||||||
|
if node["x_left"] == x_l and node["x_right"] == x_r:
|
||||||
|
node["units"].append(unit)
|
||||||
|
flag = True
|
||||||
|
break
|
||||||
|
if flag:
|
||||||
|
u_i += 1 # 块步进
|
||||||
|
continue
|
||||||
|
# ========== 根据当前块创建新的节点,加入待考虑节点 ==========
|
||||||
|
now_nodes.append(
|
||||||
|
{
|
||||||
|
"x_left": x_l,
|
||||||
|
"x_right": x_r,
|
||||||
|
"r_top": r_i,
|
||||||
|
"r_bottom": r_i,
|
||||||
|
"units": [unit],
|
||||||
|
"children": [],
|
||||||
|
}
|
||||||
|
)
|
||||||
|
u_i += 1 # 块步进
|
||||||
|
# 将剩余节点也加入节点树
|
||||||
|
for node in now_nodes:
|
||||||
|
complete(node)
|
||||||
|
# 整理所有节点
|
||||||
|
for node in completed_nodes:
|
||||||
|
# 所有子节点 按从左到右排序
|
||||||
|
node["children"].sort(key=lambda n: n["x_left"])
|
||||||
|
# 所有块单元 按从上到下排序
|
||||||
|
node["units"].sort(key=lambda u: u[0][1])
|
||||||
|
return root
|
||||||
|
|
||||||
|
# ======================= 前序遍历布局树,求节点序列 =====================
|
||||||
|
def _preorder_traversal(self, root):
|
||||||
|
if not root:
|
||||||
|
return []
|
||||||
|
stack = [root]
|
||||||
|
result = []
|
||||||
|
while stack:
|
||||||
|
node = stack.pop()
|
||||||
|
result.append(node)
|
||||||
|
# 将当前节点的子节点逆序压入栈中,以保证左子节点先于右子节点处理
|
||||||
|
stack += reversed(node["children"])
|
||||||
|
return result
|
||||||
|
|
||||||
|
# ======================= 从节点序列中,提取原始文本块序列 =====================
|
||||||
|
def _get_text_blocks(self, nodes):
|
||||||
|
result = []
|
||||||
|
for node in nodes:
|
||||||
|
for unit in node["units"]:
|
||||||
|
result.append(unit[1])
|
||||||
|
return result
|
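A minimal sketch of driving the sorter above (boxes are made up; assuming the module is importable under the path shown in the diff): blocks from a two-column page come back in column-major reading order.

from flask_app.PaddleOCR.python_api.tbpu.parser_tools.gap_tree import GapTree

blocks = [  # bbox = (x0, y0, x1, y1); a left column and a right column
    {"bbox": (0, 0, 90, 20), "text": "L1"},
    {"bbox": (100, 0, 190, 20), "text": "R1"},
    {"bbox": (0, 30, 90, 50), "text": "L2"},
    {"bbox": (100, 30, 190, 50), "text": "R2"},
]
gt = GapTree(lambda tb: tb["bbox"])
print([tb["text"] for tb in gt.sort(blocks)])  # ['L1', 'L2', 'R1', 'R2']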
@ -0,0 +1,95 @@
|
|||||||
|
# =========================================
|
||||||
|
# =============== 按行预处理 ===============
|
||||||
|
# =========================================
|
||||||
|
|
||||||
|
from math import atan2, cos, sin, sqrt, pi, radians, degrees
|
||||||
|
from statistics import median # 中位数
|
||||||
|
|
||||||
|
angle_threshold = 3 # 进行一些操作的最小角度阈值
|
||||||
|
angle_threshold_rad = radians(angle_threshold)
|
||||||
|
|
||||||
|
|
||||||
|
# 计算两点之间的距离
|
||||||
|
def _distance(point1, point2):
|
||||||
|
return sqrt((point2[0] - point1[0]) ** 2 + (point2[1] - point1[1]) ** 2)
|
||||||
|
|
||||||
|
|
||||||
|
# 计算一个box的旋转角度
|
||||||
|
def _calculateAngle(box):
|
||||||
|
# 获取宽高
|
||||||
|
width = _distance(box[0], box[1])
|
||||||
|
height = _distance(box[1], box[2])
|
||||||
|
# 选择距离较大的两个顶点对,计算角度弧度值
|
||||||
|
if width < height:
|
||||||
|
angle_rad = atan2(box[2][1] - box[1][1], box[2][0] - box[1][0])
|
||||||
|
else:
|
||||||
|
angle_rad = atan2(box[1][1] - box[0][1], box[1][0] - box[0][0])
|
||||||
|
# 标准化角度到[-pi/2, pi/2)范围(加上阈值)
|
||||||
|
if angle_rad < -pi / 2 + angle_threshold_rad:
|
||||||
|
angle_rad += pi
|
||||||
|
elif angle_rad >= pi / 2 + angle_threshold_rad:
|
||||||
|
angle_rad -= pi
|
||||||
|
return angle_rad
|
||||||
|
|
||||||
|
|
||||||
|
# 估计一组文本块的旋转角度
|
||||||
|
def _estimateRotation(textBlocks):
|
||||||
|
# blocks["box"] = [左上角,右上角,右下角,左下角]
|
||||||
|
angle_rads = (_calculateAngle(block["box"]) for block in textBlocks)
|
||||||
|
median_angle = median(angle_rads) # 中位数
|
||||||
|
return median_angle
|
||||||
|
|
||||||
|
|
||||||
|
# 获取旋转后的标准bbox。angle_threshold为执行旋转的阈值(最小角度值)。
|
||||||
|
def _getBboxes(textBlocks, rotation_rad):
|
||||||
|
# 角度低于阈值(接近0°),则不进行旋转,以提高性能。
|
||||||
|
if abs(rotation_rad) <= angle_threshold_rad:
|
||||||
|
bboxes = [
|
||||||
|
( # 直接构造bbox
|
||||||
|
min(x for x, y in tb["box"]),
|
||||||
|
min(y for x, y in tb["box"]),
|
||||||
|
max(x for x, y in tb["box"]),
|
||||||
|
max(y for x, y in tb["box"]),
|
||||||
|
)
|
||||||
|
for tb in textBlocks
|
||||||
|
]
|
||||||
|
# 否则,进行旋转操作。
|
||||||
|
else:
|
||||||
|
# print(f"文本块预处理旋转 {degrees(rotation_rad):.2f} °")
|
||||||
|
bboxes = []
|
||||||
|
min_x, min_y = float("inf"), float("inf") # 初始化最小的x和y坐标
|
||||||
|
cos_angle = cos(-rotation_rad) # 计算角度正弦值
|
||||||
|
sin_angle = sin(-rotation_rad)
|
||||||
|
for tb in textBlocks:
|
||||||
|
box = tb["box"]
|
||||||
|
rotated_box = [ # 旋转box的每个顶点
|
||||||
|
(cos_angle * x - sin_angle * y, sin_angle * x + cos_angle * y)
|
||||||
|
for x, y in box
|
||||||
|
]
|
||||||
|
# 解包旋转后的顶点坐标,分别得到所有x和y的值
|
||||||
|
xs, ys = zip(*rotated_box)
|
||||||
|
# 构建标准bbox (左上角x, 左上角y, 右下角x, 右下角y)
|
||||||
|
bbox = (min(xs), min(ys), max(xs), max(ys))
|
||||||
|
bboxes.append(bbox)
|
||||||
|
min_x, min_y = min(min_x, bbox[0]), min(min_y, bbox[1])
|
||||||
|
# 如果旋转后存在负坐标,将所有包围盒平移,使得最小的x和y坐标为0,确保所有坐标非负
|
||||||
|
if min_x < 0 or min_y < 0:
|
||||||
|
bboxes = [
|
||||||
|
(x - min_x, y - min_y, x2 - min_x, y2 - min_y)
|
||||||
|
for (x, y, x2, y2) in bboxes
|
||||||
|
]
|
||||||
|
return bboxes
|
||||||
|
|
||||||
|
|
||||||
|
# 预处理 textBlocks ,将包围盒 ["box"] 转为标准化 bbox
|
||||||
|
def linePreprocessing(textBlocks):
|
||||||
|
# 判断角度
|
||||||
|
rotation_rad = _estimateRotation(textBlocks)
|
||||||
|
# 获取标准化bbox
|
||||||
|
bboxes = _getBboxes(textBlocks, rotation_rad)
|
||||||
|
# 写入tb
|
||||||
|
for i, tb in enumerate(textBlocks):
|
||||||
|
tb["normalized_bbox"] = bboxes[i]
|
||||||
|
# 按y排序
|
||||||
|
textBlocks.sort(key=lambda tb: tb["normalized_bbox"][1])
|
||||||
|
return textBlocks
|
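A small sketch of what linePreprocessing produces (made-up boxes, same import assumption): each block gains a 'normalized_bbox' of (x0, y0, x1, y1), and the list is returned sorted by its top edge.

from flask_app.PaddleOCR.python_api.tbpu.parser_tools.line_preprocessing import linePreprocessing

blocks = [
    {"box": [[0, 40], [100, 40], [100, 60], [0, 60]], "text": "second"},
    {"box": [[0, 0], [100, 0], [100, 20], [0, 20]], "text": "first"},
]
for tb in linePreprocessing(blocks):
    print(tb["text"], tb["normalized_bbox"])
# first (0, 0, 100, 20)
# second (0, 40, 100, 60)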
@ -0,0 +1,173 @@
# Paragraph analyser
# For text blocks that already belong to one column region, decide their paragraph relations.

from typing import Callable
import unicodedata


# Given the last character of the previous sentence and the first character of
# the next sentence, return the separator to place between them.
def word_separator(letter1, letter2):

    # is the Unicode character part of the Chinese / Japanese / Korean sets?
    def is_cjk(character):
        cjk_unicode_ranges = [
            (0x4E00, 0x9FFF),  # Chinese
            (0x3040, 0x30FF),  # Japanese
            (0x1100, 0x11FF),  # Korean
            (0x3130, 0x318F),  # Hangul compatibility jamo
            (0xAC00, 0xD7AF),  # Hangul syllables
            # full-width symbols
            (0x3000, 0x303F),  # CJK symbols and punctuation
            (0xFE30, 0xFE4F),  # CJK compatibility forms
            (0xFF00, 0xFFEF),  # half-width and full-width forms
        ]
        return any(start <= ord(character) <= end for start, end in cjk_unicode_ranges)

    if is_cjk(letter1) and is_cjk(letter2):
        return ""

    # special case: the previous text ends with a hyphen
    if letter1 == "-":
        return ""
    # special case: the following text starts with any punctuation
    if unicodedata.category(letter2).startswith("P"):
        return ""
    # otherwise insert a space
    return " "


TH = 1.2 # 行高用作对比的阈值
|
||||||
|
|
||||||
|
|
||||||
|
class ParagraphParse:
|
||||||
|
def __init__(self, get_info: Callable, set_end: Callable) -> None:
|
||||||
|
"""
|
||||||
|
:param get_info: 函数,传入单个文本块,
|
||||||
|
返回该文本块的信息元组: ( (x0, y0, x1, y1), "文本" )
|
||||||
|
:param set_end: 函数,传入单个文本块 和文本尾部的分隔符,该函数要将分隔符保存。
|
||||||
|
"""
|
||||||
|
self.get_info = get_info
|
||||||
|
self.set_end = set_end
|
||||||
|
|
||||||
|
# ======================= 调用接口:对文本块列表进行结尾分隔符预测 =====================
|
||||||
|
def run(self, text_blocks: list):
|
||||||
|
"""
|
||||||
|
对属于一个区块内的文本块列表,进行段落分析,预测每个文本块结尾的分隔符。
|
||||||
|
|
||||||
|
:param text_blocks: 文本块对象列表
|
||||||
|
:return: 排序后的文本块列表
|
||||||
|
"""
|
||||||
|
# 封装块单元
|
||||||
|
units = self._get_units(text_blocks, self.get_info)
|
||||||
|
# 执行分析
|
||||||
|
self._parse(units)
|
||||||
|
return text_blocks
|
||||||
|
|
||||||
|
# ======================= 封装块单元列表 =====================
|
||||||
|
# 将原始文本块,封装为 ( (x0,y0,x2,y2), ("开头","结尾"), 原始 ) 。
|
||||||
|
def _get_units(self, text_blocks, get_info):
|
||||||
|
units = []
|
||||||
|
for tb in text_blocks:
|
||||||
|
bbox, text = get_info(tb)
|
||||||
|
units.append((bbox, (text[0], text[-1]), tb))
|
||||||
|
return units
|
||||||
|
|
||||||
|
# ======================= 分析 =====================
|
||||||
|
|
||||||
|
# 执行分析
|
||||||
|
def _parse(self, units):
|
||||||
|
units.sort(key=lambda a: a[0][1]) # 确保从上到下有序
|
||||||
|
para_l, para_top, para_r, para_bottom = units[0][0] # 当前段的左右
|
||||||
|
para_line_h = para_bottom - para_top # 当前段行高
|
||||||
|
para_line_s = None # 当前段行间距
|
||||||
|
now_para = [units[0]] # 当前段的块
|
||||||
|
paras = [] # 总的段
|
||||||
|
paras_line_space = [] # 总的段的行间距
|
||||||
|
# 取 左右相等为一个自然段的主体
|
||||||
|
for i in range(1, len(units)):
|
||||||
|
l, top, r, bottom = units[i][0] # 当前块上下左右边缘
|
||||||
|
h = bottom - top
|
||||||
|
ls = top - para_bottom # 行间距
|
||||||
|
# 检测是否同一段
|
||||||
|
if ( # 左右边缘都相等
|
||||||
|
abs(para_l - l) <= para_line_h * TH
|
||||||
|
and abs(para_r - r) <= para_line_h * TH
|
||||||
|
# 行间距不大
|
||||||
|
and (para_line_s == None or ls < para_line_s + para_line_h * 0.5)
|
||||||
|
):
|
||||||
|
# 更新数据
|
||||||
|
para_l = (para_l + l) / 2
|
||||||
|
para_r = (para_r + r) / 2
|
||||||
|
para_line_h = (para_line_h + h) / 2
|
||||||
|
para_line_s = ls if para_line_s == None else (para_line_s + ls) / 2
|
||||||
|
# 添加到当前段
|
||||||
|
now_para.append(units[i])
|
||||||
|
else: # 非同一段,归档上一段,创建新一段
|
||||||
|
paras.append(now_para)
|
||||||
|
paras_line_space.append(para_line_s)
|
||||||
|
now_para = [units[i]]
|
||||||
|
para_l, para_r, para_line_h = l, r, bottom - top
|
||||||
|
para_line_s = None
|
||||||
|
para_bottom = bottom
|
||||||
|
# 归档最后一段
|
||||||
|
paras.append(now_para)
|
||||||
|
paras_line_space.append(para_line_s)
|
||||||
|
|
||||||
|
# 合并只有1行的段,添加到上/下段作为首/尾句
|
||||||
|
for i1 in reversed(range(len(paras))):
|
||||||
|
para = paras[i1]
|
||||||
|
if len(para) == 1:
|
||||||
|
l, top, r, bottom = para[0][0]
|
||||||
|
up_flag = down_flag = False
|
||||||
|
# 上段末尾条件:左对齐,右不超,行间距够小
|
||||||
|
if i1 > 0:
|
||||||
|
# 检查左右
|
||||||
|
up_l, up_top, up_r, up_bottom = paras[i1 - 1][-1][0]
|
||||||
|
up_dist, up_h = abs(up_l - l), up_bottom - up_top
|
||||||
|
up_flag = up_dist <= up_h * TH and r <= up_r + up_h * TH
|
||||||
|
# 检查行间距
|
||||||
|
if (
|
||||||
|
paras_line_space[i1 - 1] != None
|
||||||
|
and top - up_bottom > paras_line_space[i1 - 1] + up_h * 0.5
|
||||||
|
):
|
||||||
|
up_flag = False
|
||||||
|
# 下段开头条件:右对齐/单行超出,左缩进
|
||||||
|
if i1 < len(paras) - 1:
|
||||||
|
down_l, down_top, down_r, down_bottom = paras[i1 + 1][0][0]
|
||||||
|
down_h = down_bottom - down_top
|
||||||
|
# 左对齐或缩进
|
||||||
|
if down_l - down_h * TH <= l <= down_l + down_h * (1 + TH):
|
||||||
|
if len(paras[i1 + 1]) > 1: # 多行,右对齐
|
||||||
|
down_flag = abs(down_r - r) <= down_h * TH
|
||||||
|
else: # 单行,右可超出
|
||||||
|
down_flag = down_r - down_h * TH < r
|
||||||
|
# 检查行间距
|
||||||
|
if (
|
||||||
|
paras_line_space[i1 + 1] != None
|
||||||
|
and down_top - bottom > paras_line_space[i1 + 1] + down_h * 0.5
|
||||||
|
):
|
||||||
|
down_flag = False
|
||||||
|
|
||||||
|
# 选择添加到上还是下段
|
||||||
|
if up_flag and down_flag: # 两段都符合,则选择垂直距离更近的
|
||||||
|
if top - up_bottom < down_top - bottom:
|
||||||
|
paras[i1 - 1].append(para[0])
|
||||||
|
else:
|
||||||
|
paras[i1 + 1].insert(0, para[0])
|
||||||
|
elif up_flag: # 只有一段符合,直接选择
|
||||||
|
paras[i1 - 1].append(para[0])
|
||||||
|
elif down_flag:
|
||||||
|
paras[i1 + 1].insert(0, para[0])
|
||||||
|
if up_flag or down_flag:
|
||||||
|
del paras[i1]
|
||||||
|
del paras_line_space[i1]
|
||||||
|
|
||||||
|
# 刷新所有段,添加end
|
||||||
|
for para in paras:
|
||||||
|
for i1 in range(len(para) - 1):
|
||||||
|
letter1 = para[i1][1][1] # 行1结尾字母
|
||||||
|
letter2 = para[i1 + 1][1][0] # 行2开头字母
|
||||||
|
sep = word_separator(letter1, letter2)
|
||||||
|
self.set_end(para[i1][2], sep)
|
||||||
|
self.set_end(para[-1][2], "\n")
|
||||||
|
return units
|
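A short sketch of the two entry points above (made-up data, same import assumption): word_separator picks the joining string for a pair of characters, and ParagraphParse writes an 'end' separator per block through the user-supplied callbacks.

from flask_app.PaddleOCR.python_api.tbpu.parser_tools.paragraph_parse import (
    ParagraphParse, word_separator)

print(repr(word_separator("文", "本")))  # ''  : no space between CJK characters
print(repr(word_separator("d", "o")))    # ' ' : Latin words get a space
print(repr(word_separator("-", "o")))    # ''  : hyphenated line break, no space

blocks = [  # two lines of one paragraph; bbox = (x0, y0, x1, y1)
    {"bbox": (0, 0, 200, 20), "text": "paragraph detection is based on"},
    {"bbox": (0, 25, 200, 45), "text": "aligned edges and line spacing."},
]
pp = ParagraphParse(lambda tb: (tb["bbox"], tb["text"]),
                    lambda tb, end: tb.update(end=end))
pp.run(blocks)
print([tb["end"] for tb in blocks])  # [' ', '\n']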
22
flask_app/PaddleOCR/python_api/tbpu/tbpu.py
Normal file
@ -0,0 +1,22 @@
# tbpu : text block processing unit
# Base class of the text block processors.
# In an OCR result, one element holding text, a bounding box and a confidence score is called a "text block".
# A text block is not necessarily a complete sentence or paragraph; usually it is a loose fragment of text.
# One OCR result is usually made up of several text blocks.
# A text block processor takes the incoming blocks and processes them, e.g. merging, sorting or deleting blocks.


class Tbpu:
    def __init__(self):
        self.tbpuName = "文块处理单元-未知"

    def run(self, textBlocks):
        """Input: textBlocks, a list of text blocks. Example:\n
        [
            {'box': [[29, 19], [172, 19], [172, 44], [29, 44]], 'score': 0.89, 'text': '文本111'},
            {'box': [[29, 60], [161, 60], [161, 86], [29, 86]], 'score': 0.75, 'text': '文本222'},
        ]
        Output: the sorted textBlocks list, where every block gains the key:
        'end'  trailing separator
        """
        return textBlocks
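The base class above only defines the contract; a hypothetical subclass (not part of the repository) is enough to show the shape a processor takes:

from flask_app.PaddleOCR.python_api.tbpu.tbpu import Tbpu


class JoinWithNewline(Tbpu):  # hypothetical example subclass
    def __init__(self):
        self.tbpuName = "example-join-with-newline"

    def run(self, textBlocks):
        for tb in textBlocks:
            tb["end"] = "\n"  # every block ends its own line
        return textBlocks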
BIN
flask_app/PaddleOCR/python_api/test.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 26 KiB |
BIN
flask_app/PaddleOCR/vcomp140.dll
Normal file
Binary file not shown.
BIN
flask_app/PaddleOCR/vcruntime140.dll
Normal file
Binary file not shown.
BIN
flask_app/PaddleOCR/vcruntime140_1.dll
Normal file
Binary file not shown.
594
flask_app/general/OCR调用参考.py
Normal file
@ -0,0 +1,594 @@
|
|||||||
|
# # import re
|
||||||
|
# # import PyPDF2
|
||||||
|
# # import tempfile
|
||||||
|
# # from utils.ocr_engine import OcrEngine
|
||||||
|
# #
|
||||||
|
# #
|
||||||
|
# # # 假设您的OCR函数名为 `ocr_extract`,并接受图片文件路径作为参数
|
||||||
|
# # # 请将此函数替换为您实际的OCR实现
|
||||||
|
# #
|
||||||
|
# # def clean_page_content(text, common_header):
|
||||||
|
# # # 首先删除抬头公共部分
|
||||||
|
# # if common_header: # 确保有公共抬头才进行替换
|
||||||
|
# # for header_line in common_header.split('\n'):
|
||||||
|
# # if header_line.strip(): # 只处理非空行
|
||||||
|
# # # 替换首次出现的完整行
|
||||||
|
# # text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)
|
||||||
|
# #
|
||||||
|
# # # 删除页码 eg:89/129 这个代码分三步走可以把89/129完全删除
|
||||||
|
# # text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时
|
||||||
|
# # text = re.sub(r'\s+\d+\s*$', '', text) # 删除结尾的页码
|
||||||
|
# # text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 删除形如 /129 的页码
|
||||||
|
# # text = re.sub(r'\s*[—-]\s*\d+\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码
|
||||||
|
# # return text
|
||||||
|
# #
|
||||||
|
# #
|
||||||
|
# # def extract_common_header(pdf_path):
|
||||||
|
# # from PyPDF2 import PdfReader
|
||||||
|
# #
|
||||||
|
# # def get_headers(pdf_document, start_page, pages_to_read):
|
||||||
|
# # headers = []
|
||||||
|
# # for i in range(start_page, min(start_page + pages_to_read, len(pdf_document.pages))):
|
||||||
|
# # page = pdf_document.pages[i]
|
||||||
|
# # text = page.extract_text() or ""
|
||||||
|
# # if text:
|
||||||
|
# # # 只取每页的前三行,去除前后的空白字符
|
||||||
|
# # first_lines = [line.strip() for line in text.strip().split('\n')[:3]]
|
||||||
|
# # headers.append(first_lines)
|
||||||
|
# # return headers
|
||||||
|
# #
|
||||||
|
# # def find_common_headers(headers):
|
||||||
|
# # if not headers:
|
||||||
|
# # return []
|
||||||
|
# #
|
||||||
|
# # # 使用 zip 对齐所有页的对应行
|
||||||
|
# # common_headers = []
|
||||||
|
# # for lines in zip(*headers):
|
||||||
|
# # # 检查所有行是否完全相同
|
||||||
|
# # if all(line == lines[0] for line in lines[1:]):
|
||||||
|
# # common_headers.append(lines[0])
|
||||||
|
# # return common_headers
|
||||||
|
# #
|
||||||
|
# # pdf_document = PdfReader(pdf_path)
|
||||||
|
# # total_pages = len(pdf_document.pages)
|
||||||
|
# #
|
||||||
|
# # # 定义两个提取策略
|
||||||
|
# # strategies = []
|
||||||
|
# # if total_pages >= 3:
|
||||||
|
# # # 策略1:中间的3页
|
||||||
|
# # middle_page = total_pages // 2
|
||||||
|
# # start_page = max(0, middle_page - 1)
|
||||||
|
# # strategies.append((start_page, 3))
|
||||||
|
# # elif total_pages == 2:
|
||||||
|
# # # 策略1:2页
|
||||||
|
# # strategies.append((0, 2))
|
||||||
|
# # else:
|
||||||
|
# # # 策略1:1页
|
||||||
|
# # strategies.append((0, 1))
|
||||||
|
# #
|
||||||
|
# # # 策略2:前三页
|
||||||
|
# # if total_pages >= 3:
|
||||||
|
# # strategies.append((0, 3))
|
||||||
|
# # elif total_pages == 2:
|
||||||
|
# # strategies.append((0, 2))
|
||||||
|
# # elif total_pages == 1:
|
||||||
|
# # strategies.append((0, 1))
|
||||||
|
# #
|
||||||
|
# # common_headers = []
|
||||||
|
# #
|
||||||
|
# # for idx, (start, count) in enumerate(strategies):
|
||||||
|
# # headers = get_headers(pdf_document, start, count)
|
||||||
|
# # if len(headers) < 2:
|
||||||
|
# # continue # 需要至少2页来比较
|
||||||
|
# #
|
||||||
|
# # current_common = find_common_headers(headers)
|
||||||
|
# # if current_common:
|
||||||
|
# # common_headers = current_common
|
||||||
|
# # break # 找到共同部分后退出
|
||||||
|
# # # 如果没有找到,继续下一个策略
|
||||||
|
# #
|
||||||
|
# # return '\n'.join(common_headers)
|
||||||
|
# #
|
||||||
|
# #
|
||||||
|
# # def extract_images_from_page(reader, page):
|
||||||
|
# # images = []
|
||||||
|
# # try:
|
||||||
|
# # for img in page.images:
|
||||||
|
# # xref = img['xref']
|
||||||
|
# # image = reader.extract_image(xref)
|
||||||
|
# # image_bytes = image['image']
|
||||||
|
# # image_ext = image['ext']
|
||||||
|
# # images.append({'data': image_bytes, 'ext': image_ext})
|
||||||
|
# # except Exception as e:
|
||||||
|
# # print(f"提取第{reader.pages.index(page) + 1}页图片时出错: {e}")
|
||||||
|
# # return images
|
||||||
|
# #
|
||||||
|
# #
|
||||||
|
# # def extract_text_by_page(file_path):
|
||||||
|
# # common_header = extract_common_header(file_path)
|
||||||
|
# # # print(f"公共抬头:{common_header}")
|
||||||
|
# # # print("--------------------正文开始-------------------")
|
||||||
|
# # result = ""
|
||||||
|
# # with open(file_path, 'rb') as file:
|
||||||
|
# # reader = PyPDF2.PdfReader(file)
|
||||||
|
# # num_pages = len(reader.pages)
|
||||||
|
# # # print(f"Total pages: {num_pages}")
|
||||||
|
# # for page_num in range(num_pages):
|
||||||
|
# # page = reader.pages[page_num]
|
||||||
|
# # text = page.extract_text() or ""
|
||||||
|
# #
|
||||||
|
# # # 提取图片并进行OCR
|
||||||
|
# # images = extract_images_from_page(reader, page)
|
||||||
|
# # ocr_text = ""
|
||||||
|
# # for image in images:
|
||||||
|
# # image_data = image['data']
|
||||||
|
# # image_ext = image['ext']
|
||||||
|
# # try:
|
||||||
|
# # with tempfile.NamedTemporaryFile(delete=True, suffix='.' + image_ext) as temp_image:
|
||||||
|
# # temp_image.write(image_data)
|
||||||
|
# # temp_image.flush()
|
||||||
|
# # # 调用OCR函数
|
||||||
|
# # ocr_result = OcrEngine.recognize_text_from_image(temp_image.name)
|
||||||
|
# # ocr_text += ocr_result + "\n"
|
||||||
|
# # except Exception as e:
|
||||||
|
# # print(f"处理第{page_num + 1}页图片时出错: {e}")
|
||||||
|
# #
|
||||||
|
# # # 清理文本
|
||||||
|
# # cleaned_text = clean_page_content(text, common_header)
|
||||||
|
# # # 合并OCR文本
|
||||||
|
# # if ocr_text.strip():
|
||||||
|
# # cleaned_text += "\n" + ocr_text
|
||||||
|
# # result += cleaned_text
|
||||||
|
# # return result
|
||||||
|
# #
|
||||||
|
# #
|
||||||
|
# # def extract_text_json_by_page(file_path):
|
||||||
|
# # common_header = extract_common_header(file_path)
|
||||||
|
# # # print(f"公共抬头:{common_header}")
|
||||||
|
# # # print("--------------------正文开始-------------------")
|
||||||
|
# # result = {}
|
||||||
|
# # with open(file_path, 'rb') as file:
|
||||||
|
# # reader = PyPDF2.PdfReader(file)
|
||||||
|
# # num_pages = len(reader.pages)
|
||||||
|
# # # print(f"Total pages: {num_pages}")
|
||||||
|
# # for page_num in range(num_pages):
|
||||||
|
# # page = reader.pages[page_num]
|
||||||
|
# # text = page.extract_text() or ""
|
||||||
|
# #
|
||||||
|
# # # 提取图片并进行OCR
|
||||||
|
# # images = extract_images_from_page(reader, page)
|
||||||
|
# # ocr_text = ""
|
||||||
|
# # for image in images:
|
||||||
|
# # image_data = image['data']
|
||||||
|
# # image_ext = image['ext']
|
||||||
|
# # try:
|
||||||
|
# # with tempfile.NamedTemporaryFile(delete=True, suffix='.' + image_ext) as temp_image:
|
||||||
|
# # temp_image.write(image_data)
|
||||||
|
# # temp_image.flush()
|
||||||
|
# # # 调用OCR函数
|
||||||
|
# # ocr_result = OcrEngine.recognize_text_from_image(temp_image.name)
|
||||||
|
# # ocr_text += ocr_result + "\n"
|
||||||
|
# # except Exception as e:
|
||||||
|
# # print(f"处理第{page_num + 1}页图片时出错: {e}")
|
||||||
|
# #
|
||||||
|
# # # 清理文本
|
||||||
|
# # cleaned_text = clean_page_content(text, common_header)
|
||||||
|
# # # 合并OCR文本
|
||||||
|
# # if ocr_text.strip():
|
||||||
|
# # cleaned_text += "\n" + ocr_text
|
||||||
|
# # result[str(page_num + 1)] = cleaned_text
|
||||||
|
# # return result
|
||||||
|
# #
|
||||||
|
# #
|
||||||
|
# # if __name__ == '__main__':
|
||||||
|
# # pdf_path = "C:/test/iDS-TCE900神捕电警抓拍单元产品介绍.pdf"
|
||||||
|
# # res = extract_text_json_by_page(pdf_path)
|
||||||
|
# # print(res)
|
||||||
|
# import json
|
||||||
|
# import os
|
||||||
|
# import re
|
||||||
|
# import shutil
|
||||||
|
# import uuid
|
||||||
|
#
|
||||||
|
# import PyPDF2
|
||||||
|
# import fitz # PyMuPDF
|
||||||
|
# import tempfile
|
||||||
|
# from utils.ocr_engine import OcrEngine
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# # 假设您的OCR函数名为 `ocr_extract`,并接受图片文件路径作为参数
|
||||||
|
# def ocr_extract(image_path):
|
||||||
|
# # 示例:调用您的OCR脚本并返回识别的文本
|
||||||
|
# # 例如:
|
||||||
|
# # return your_ocr_function(image_path)
|
||||||
|
# # 将图片保存到tmp目录
|
||||||
|
# # shutil.copy(image_path, f'tmp/{uuid.uuid4()}.png')
|
||||||
|
# # return "OCR提取的文本" # 替换为实际的OCR结果
|
||||||
|
# return OcrEngine.recognize_text_from_image(image_path)
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# def clean_page_content(text, common_header):
|
||||||
|
# # 首先删除抬头公共部分
|
||||||
|
# if common_header: # 确保有公共抬头才进行替换
|
||||||
|
# for header_line in common_header.split('\n'):
|
||||||
|
# if header_line.strip(): # 只处理非空行
|
||||||
|
# # 替换首次出现的完整行
|
||||||
|
# text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)
|
||||||
|
#
|
||||||
|
# # 删除页码 eg:89/129 这个代码分三步走可以把89/129完全删除
|
||||||
|
# text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时
|
||||||
|
# text = re.sub(r'\s+\d+\s*$', '', text) # 删除结尾的页码
|
||||||
|
# text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 删除形如 /129 的页码
|
||||||
|
# text = re.sub(r'\s*[—-]\s*\d+\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码
|
||||||
|
# return text
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# def extract_common_header(pdf_path):
|
||||||
|
# from PyPDF2 import PdfReader
|
||||||
|
#
|
||||||
|
# def get_headers(pdf_document, start_page, pages_to_read):
|
||||||
|
# headers = []
|
||||||
|
# for i in range(start_page, min(start_page + pages_to_read, len(pdf_document.pages))):
|
||||||
|
# page = pdf_document.pages[i]
|
||||||
|
# text = page.extract_text() or ""
|
||||||
|
# if text:
|
||||||
|
# # 只取每页的前三行,去除前后的空白字符
|
||||||
|
# first_lines = [line.strip() for line in text.strip().split('\n')[:3]]
|
||||||
|
# headers.append(first_lines)
|
||||||
|
# return headers
|
||||||
|
#
|
||||||
|
# def find_common_headers(headers):
|
||||||
|
# if not headers:
|
||||||
|
# return []
|
||||||
|
#
|
||||||
|
# # 使用 zip 对齐所有页的对应行
|
||||||
|
# common_headers = []
|
||||||
|
# for lines in zip(*headers):
|
||||||
|
# # 检查所有行是否完全相同
|
||||||
|
# if all(line == lines[0] for line in lines[1:]):
|
||||||
|
# common_headers.append(lines[0])
|
||||||
|
# return common_headers
|
||||||
|
#
|
||||||
|
# pdf_document = PdfReader(pdf_path)
|
||||||
|
# total_pages = len(pdf_document.pages)
|
||||||
|
#
|
||||||
|
# # 定义两个提取策略
|
||||||
|
# strategies = []
|
||||||
|
# if total_pages >= 3:
|
||||||
|
# # 策略1:中间的3页
|
||||||
|
# middle_page = total_pages // 2
|
||||||
|
# start_page = max(0, middle_page - 1)
|
||||||
|
# strategies.append((start_page, 3))
|
||||||
|
# elif total_pages == 2:
|
||||||
|
# # 策略1:2页
|
||||||
|
# strategies.append((0, 2))
|
||||||
|
# else:
|
||||||
|
# # 策略1:1页
|
||||||
|
# strategies.append((0, 1))
|
||||||
|
#
|
||||||
|
# # 策略2:前三页
|
||||||
|
# if total_pages >= 3:
|
||||||
|
# strategies.append((0, 3))
|
||||||
|
# elif total_pages == 2:
|
||||||
|
# strategies.append((0, 2))
|
||||||
|
# elif total_pages == 1:
|
||||||
|
# strategies.append((0, 1))
|
||||||
|
#
|
||||||
|
# common_headers = []
|
||||||
|
#
|
||||||
|
# for idx, (start, count) in enumerate(strategies):
|
||||||
|
# headers = get_headers(pdf_document, start, count)
|
||||||
|
# if len(headers) < 2:
|
||||||
|
# continue # 需要至少2页来比较
|
||||||
|
#
|
||||||
|
# current_common = find_common_headers(headers)
|
||||||
|
# if current_common:
|
||||||
|
# common_headers = current_common
|
||||||
|
# break # 找到共同部分后退出
|
||||||
|
# # 如果没有找到,继续下一个策略
|
||||||
|
#
|
||||||
|
# return '\n'.join(common_headers)
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# def extract_images_with_pymupdf(pdf_path):
|
||||||
|
# images = []
|
||||||
|
# try:
|
||||||
|
# doc = fitz.open(pdf_path)
|
||||||
|
# for page_num in range(len(doc)):
|
||||||
|
# page = doc.load_page(page_num)
|
||||||
|
# image_list = page.get_images(full=True)
|
||||||
|
# for img in image_list:
|
||||||
|
# xref = img[0]
|
||||||
|
# base_image = doc.extract_image(xref)
|
||||||
|
# image_bytes = base_image['image']
|
||||||
|
# image_ext = base_image.get('ext', 'png')
|
||||||
|
# # image_width = base_image.get['xres']
|
||||||
|
# # image_height = base_image.get['yres']
|
||||||
|
# # images.append({'data': image_bytes, 'ext': image_ext, 'page_num': page_num + 1, 'width': image_width, 'height': image_height}).
|
||||||
|
# images.append({'data': image_bytes, 'ext': image_ext, 'page_num': page_num + 1})
|
||||||
|
#
|
||||||
|
# # 确保输出目录存在
|
||||||
|
# if not os.path.exists("output_dir"):
|
||||||
|
# os.makedirs("output_dir")
|
||||||
|
# print(f"创建输出目录: output_dir")
|
||||||
|
# # 构建图片文件名,例如: page_1_img.png
|
||||||
|
# image_filename = f"page_{page_num}_img.{image_ext}"
|
||||||
|
# image_path = os.path.join("output_dir", image_filename)
|
||||||
|
# try:
|
||||||
|
# with open(image_path, 'wb') as img_file:
|
||||||
|
# img_file.write(image_bytes)
|
||||||
|
# print(f"保存图片: {image_path}")
|
||||||
|
# except Exception as e:
|
||||||
|
# print(f"保存图片 {image_filename} 时出错: {e}")
|
||||||
|
#
|
||||||
|
# except Exception as e:
|
||||||
|
# print(f"提取图片时出错: {e}")
|
||||||
|
# return images
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# def extract_text_json_by_page(file_path):
|
||||||
|
# common_header = extract_common_header(file_path)
|
||||||
|
# # print(f"公共抬头:{common_header}")
|
||||||
|
# # print("--------------------正文开始-------------------")
|
||||||
|
# result = {}
|
||||||
|
# # 如果已经有保存的ocr结果,直接读取
|
||||||
|
# # TODO 待修改
|
||||||
|
# if os.path.exists(file_path + ".json"):
|
||||||
|
# with open(file_path + ".json", 'r', encoding='utf-8') as f:
|
||||||
|
# result = json.load(f)
|
||||||
|
# return result
|
||||||
|
# images = extract_images_with_pymupdf(file_path)
|
||||||
|
# # filtered_images = filter_images(images)
|
||||||
|
# ocr_text_dict = {}
|
||||||
|
# for img in images:
|
||||||
|
# try:
|
||||||
|
# with tempfile.NamedTemporaryFile(delete=False, suffix='.' + img['ext']) as temp_image:
|
||||||
|
# temp_image.write(img['data'])
|
||||||
|
# temp_image.flush()
|
||||||
|
# # 调用OCR函数
|
||||||
|
# ocr_result = ocr_extract(temp_image.name)
|
||||||
|
# if img['page_num'] in ocr_text_dict:
|
||||||
|
# ocr_text_dict[img['page_num']] += ocr_result + "\n"
|
||||||
|
# else:
|
||||||
|
# ocr_text_dict[img['page_num']] = ocr_result + "\n"
|
||||||
|
# except Exception as e:
|
||||||
|
# print(f"OCR处理失败: {e}")
|
||||||
|
# finally:
|
||||||
|
# try:
|
||||||
|
# os.remove(temp_image.name)
|
||||||
|
# except Exception as e:
|
||||||
|
# print(f"删除临时文件失败: {e}")
|
||||||
|
#
|
||||||
|
# with open(file_path, 'rb') as file:
|
||||||
|
# reader = PyPDF2.PdfReader(file)
|
||||||
|
# num_pages = len(reader.pages)
|
||||||
|
# # print(f"Total pages: {num_pages}")
|
||||||
|
# for page_num in range(num_pages):
|
||||||
|
# page = reader.pages[page_num]
|
||||||
|
# text = page.extract_text() or ""
|
||||||
|
# # 清理文本
|
||||||
|
# cleaned_text = clean_page_content(text, common_header)
|
||||||
|
# # 合并OCR文本
|
||||||
|
# if (page_num + 1) in ocr_text_dict and ocr_text_dict[page_num + 1].strip():
|
||||||
|
# cleaned_text += "\n" + ocr_text_dict[page_num + 1]
|
||||||
|
# result[str(page_num + 1)] = cleaned_text
|
||||||
|
# print("pdf预处理完成")
|
||||||
|
# return result
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# def filter_images(images, min_width=200, min_height=200, text_threshold=5):
|
||||||
|
# """
|
||||||
|
# 过滤图像,保留可能包含文本的图像。
|
||||||
|
# :param images: 图像列表,每个图像包含 data, ext, page_num, width, height
|
||||||
|
# :param min_width: 最小宽度
|
||||||
|
# :param min_height: 最小高度
|
||||||
|
# :param text_threshold: 检测到的文本字符数阈值
|
||||||
|
# :return: 过滤后的图像列表
|
||||||
|
# """
|
||||||
|
# filtered = []
|
||||||
|
# for img in images:
|
||||||
|
# # 基于尺寸过滤
|
||||||
|
# if img['width'] < min_width or img['height'] < min_height:
|
||||||
|
# continue
|
||||||
|
# # 基于文本检测过滤
|
||||||
|
# try:
|
||||||
|
# # with tempfile.NamedTemporaryFile(delete=True, suffix='.' + img['ext']) as temp_image:
|
||||||
|
# # temp_image.write(img['data'])
|
||||||
|
# # temp_image.flush()
|
||||||
|
# # # 使用OCR快速检测文本
|
||||||
|
# # ocr_result = ocr_extract(temp_image.name)
|
||||||
|
# # if len(ocr_result.strip()) >= text_threshold:
|
||||||
|
# # img['ocr_text'] = ocr_result
|
||||||
|
# # filtered.append(img)
|
||||||
|
# filtered.append(img)
|
||||||
|
# except Exception as e:
|
||||||
|
# print(f"过滤图像时出错: {e}")
|
||||||
|
#
|
||||||
|
# return filtered
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# if __name__ == '__main__':
|
||||||
|
# pdf_path = "C:/test/发言系统11.pdf"
|
||||||
|
# res = extract_text_json_by_page(pdf_path)
|
||||||
|
# print(res)
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import tempfile
|
||||||
|
import shutil
|
||||||
|
import uuid
|
||||||
|
from PyPDF2 import PdfReader
|
||||||
|
import fitz # PyMuPDF
|
||||||
|
from utils.ocr_engine import OcrEngine # 请确保OcrEngine已经正确导入
|
||||||
|
from utils.local_ocr import LocalOCR
|
||||||
|
|
||||||
|
local_ocr = LocalOCR()
|
||||||
|
|
||||||
|
|
||||||
|
def ocr_extract(image_path):
|
||||||
|
# 调用您的OCR引擎来识别图像中的文本
|
||||||
|
# return OcrEngine.recognize_text_from_image(image_path)
|
||||||
|
# 调用本地ocr
|
||||||
|
return local_ocr.run(image_path)
|
||||||
|
|
||||||
|
|
||||||
|
def clean_page_content(text, common_header):
|
||||||
|
# 删除公共抬头
|
||||||
|
if common_header:
|
||||||
|
for header_line in common_header.split('\n'):
|
||||||
|
if header_line.strip():
|
||||||
|
text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)
|
||||||
|
|
||||||
|
# 删除页码
|
||||||
|
text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 开头页码
|
||||||
|
text = re.sub(r'\s+\d+\s*$', '', text) # 结尾页码
|
||||||
|
text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 形如 /129
|
||||||
|
text = re.sub(r'\s*[—-]\s*\d+\s*[—-]\s*', '', text) # 形如 '—2—' 或 '-2-'
|
||||||
|
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def extract_common_header(pdf_path):
|
||||||
|
def get_headers(pdf_document, start_page, pages_to_read):
|
||||||
|
headers = []
|
||||||
|
for i in range(start_page, min(start_page + pages_to_read, len(pdf_document.pages))):
|
||||||
|
page = pdf_document.pages[i]
|
||||||
|
text = page.extract_text() or ""
|
||||||
|
if text:
|
||||||
|
first_lines = [line.strip() for line in text.strip().split('\n')[:3]]
|
||||||
|
headers.append(first_lines)
|
||||||
|
return headers
|
||||||
|
|
||||||
|
def find_common_headers(headers):
|
||||||
|
if not headers:
|
||||||
|
return []
|
||||||
|
|
||||||
|
common_headers = []
|
||||||
|
for lines in zip(*headers):
|
||||||
|
if all(line == lines[0] for line in lines[1:]):
|
||||||
|
common_headers.append(lines[0])
|
||||||
|
return common_headers
|
||||||
|
|
||||||
|
pdf_document = PdfReader(pdf_path)
|
||||||
|
total_pages = len(pdf_document.pages)
|
||||||
|
|
||||||
|
strategies = []
|
||||||
|
if total_pages >= 3:
|
||||||
|
middle_page = total_pages // 2
|
||||||
|
start_page = max(0, middle_page - 1)
|
||||||
|
strategies.append((start_page, 3))
|
||||||
|
elif total_pages == 2:
|
||||||
|
strategies.append((0, 2))
|
||||||
|
else:
|
||||||
|
strategies.append((0, 1))
|
||||||
|
|
||||||
|
if total_pages >= 3:
|
||||||
|
strategies.append((0, 3))
|
||||||
|
elif total_pages == 2:
|
||||||
|
strategies.append((0, 2))
|
||||||
|
elif total_pages == 1:
|
||||||
|
strategies.append((0, 1))
|
||||||
|
|
||||||
|
common_headers = []
|
||||||
|
|
||||||
|
for start, count in strategies:
|
||||||
|
headers = get_headers(pdf_document, start, count)
|
||||||
|
if len(headers) < 2:
|
||||||
|
continue
|
||||||
|
|
||||||
|
current_common = find_common_headers(headers)
|
||||||
|
if current_common:
|
||||||
|
common_headers = current_common
|
||||||
|
break
|
||||||
|
|
||||||
|
return '\n'.join(common_headers)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_images_from_page(pdf_path, page_num):
|
||||||
|
images = []
|
||||||
|
try:
|
||||||
|
doc = fitz.open(pdf_path)
|
||||||
|
if page_num < 0 or page_num >= len(doc):
|
||||||
|
print(f"页码 {page_num + 1} 超出范围")
|
||||||
|
return images
|
||||||
|
|
||||||
|
page = doc.load_page(page_num)
|
||||||
|
image_list = page.get_images(full=True)
|
||||||
|
for img in image_list:
|
||||||
|
xref = img[0]
|
||||||
|
base_image = doc.extract_image(xref)
|
||||||
|
image_bytes = base_image['image']
|
||||||
|
image_ext = base_image.get('ext', 'png')
|
||||||
|
images.append({'data': image_bytes, 'ext': image_ext, 'page_num': page_num + 1})
|
||||||
|
except Exception as e:
|
||||||
|
print(f"提取图片时出错: {e}")
|
||||||
|
return images
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text_json_by_page(file_path, text_threshold=10):
|
||||||
|
"""
|
||||||
|
提取PDF每页的文本,若文本量低于text_threshold,则提取图片并OCR。
|
||||||
|
:param file_path: PDF文件路径
|
||||||
|
:param text_threshold: 文本量阈值(字符数)
|
||||||
|
:return: 字典,键为页码,值为对应的文本
|
||||||
|
"""
|
||||||
|
common_header = extract_common_header(file_path)
|
||||||
|
result = {}
|
||||||
|
|
||||||
|
# 如果已经有保存的OCR结果,直接读取
|
||||||
|
if os.path.exists(file_path + ".json"):
|
||||||
|
with open(file_path + ".json", 'r', encoding='utf-8') as f:
|
||||||
|
result = json.load(f)
|
||||||
|
return result
|
||||||
|
|
||||||
|
try:
|
||||||
|
reader = PdfReader(file_path)
|
||||||
|
num_pages = len(reader.pages)
|
||||||
|
|
||||||
|
for page_num in range(num_pages):
|
||||||
|
page = reader.pages[page_num]
|
||||||
|
text = page.extract_text() or ""
|
||||||
|
cleaned_text = clean_page_content(text, common_header)
|
||||||
|
|
||||||
|
# 检查文本量是否低于阈值
|
||||||
|
if len(cleaned_text.strip()) < text_threshold:
|
||||||
|
print(f"第 {page_num + 1} 页文本量低,开始提取图片并OCR")
|
||||||
|
images = extract_images_from_page(file_path, page_num)
|
||||||
|
for img in images:
|
||||||
|
try:
|
||||||
|
with tempfile.NamedTemporaryFile(delete=False, suffix='.' + img['ext']) as temp_image:
|
||||||
|
temp_image.write(img['data'])
|
||||||
|
temp_image.flush()
|
||||||
|
# 调用OCR函数
|
||||||
|
ocr_result = ocr_extract(temp_image.name)
|
||||||
|
if ocr_result.strip():
|
||||||
|
cleaned_text += "\n" + ocr_result
|
||||||
|
except Exception as e:
|
||||||
|
print(f"OCR处理失败: {e}")
|
||||||
|
finally:
|
||||||
|
try:
|
||||||
|
os.remove(temp_image.name)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"删除临时文件失败: {e}")
|
||||||
|
|
||||||
|
result[str(page_num + 1)] = cleaned_text
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"处理PDF时出错: {e}")
|
||||||
|
|
||||||
|
# 保存结果到JSON文件
|
||||||
|
try:
|
||||||
|
with open(file_path + ".json", 'w', encoding='utf-8') as f:
|
||||||
|
json.dump(result, f, ensure_ascii=False, indent=4)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"保存结果到JSON文件时出错: {e}")
|
||||||
|
|
||||||
|
print("PDF预处理完成")
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
pdf_path = "C:/test/所投主要产品检测报告.pdf"
|
||||||
|
res = extract_text_json_by_page(pdf_path)
|
||||||
|
print(json.dumps(res, ensure_ascii=False, indent=4))
|
30
flask_app/general/local_ocr.py
Normal file
@ -0,0 +1,30 @@
import os.path
from flask_app.PaddleOCR.python_api.PPOCR_api import GetOcrApi
from flask_app.PaddleOCR.python_api.tbpu import GetParser


class LocalOCR:
    def __init__(self):
        # Initialise the recogniser with the path of PaddleOCR-json.exe.
        # Directory of this script (kept in case an absolute path needs to be built)
        script_dir = os.path.dirname(os.path.abspath(__file__))

        # Path of the OCR engine, relative to the project working directory
        ocr_path = 'flask_app/PaddleOCR/PaddleOCR-json.exe'
        ocr = GetOcrApi(ocr_path)
        # parser = GetParser("single_para")
        self.ocr = ocr

    def run(self, pic_path):
        # Recognise one image, given its path
        getObj = self.ocr.run(pic_path)
        if getObj["code"] == 100:
            text = ""
            data = getObj["data"]
            for boxes in data:
                # put each block on its own line
                text += boxes["text"]
                text += "\n"

            return text
        else:
            return ""
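A usage sketch for the wrapper above (the image path is a placeholder; because ocr_path is relative, this assumes the process is started from the project root):

from flask_app.general.local_ocr import LocalOCR

ocr = LocalOCR()
text = ocr.run(r"C:\test\some_scanned_page.png")  # placeholder path
print(text)  # one recognized block per line, or "" if the engine reports an error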
@ -116,7 +116,7 @@ def save_extracted_text_to_txt(pdf_path, txt_path):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
|
# file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
|
||||||
file_path=r'C:\Users\Administrator\Desktop\货物标\output1\招标文件(实高电子显示屏)_procurement.pdf'
|
file_path=r"C:\Users\Administrator\Desktop\货物标\output1\2-招标文件(广水市教育局封闭管理)_procurement.pdf"
|
||||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
|
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
|
||||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
|
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
|
||||||
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
|
# file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
|
||||||
|
@ -174,20 +174,32 @@ def extract_business_deviation(procurement):
|
|||||||
|
|
||||||
business_requirements_string = json.dumps(new_data, ensure_ascii=False, indent=4)
|
business_requirements_string = json.dumps(new_data, ensure_ascii=False, indent=4)
|
||||||
# print(business_requirements_string)
|
# print(business_requirements_string)
|
||||||
prompt_template1 = """请帮我从以下文本中摘取商务要求部分,并将信息重新组织,外键名为'商务要求',键值为字符串列表,其中每个字符串为一条商务要求,保留三角▲、五角星★(若有),但是去除开头的序号(若有)。
|
prompt_template1 = """请帮我从以下文本中摘取商务要求部分,并将信息重新组织,键名为'商务要求',键值为字符串列表,其中每个字符串为一条商务要求,保留三角▲、五角星★(若有),但是去除开头的序号(若有)。
|
||||||
#角色
|
#角色
|
||||||
你是一个专业的招投标业务专家,擅长从招标文件中总结商务要求的部分,并逐条列出,作为编写商务要求偏离表的前置准备。
|
你是一个专业的招投标业务专家,擅长从招标文件中总结商务要求的部分,并逐条列出,作为编写商务要求偏离表的前置准备。
|
||||||
|
|
||||||
#要求与指南:
|
#要求与指南:
|
||||||
1. 每条内容需要有实际的含义、要求,不能光有标题性质的表述如'售后服务期限(质保期)及要求'。
|
1. 每条内容需要有实际的含义、要求,不能光有标题性质的表述如'售后服务期限(质保期)及要求'。
|
||||||
2. 你的回答内容需从所给文本中整理,尽量不改变原文的表达,请勿擅自添加三角▲、五角星★;若输入文本中存在嵌套键值对格式,且键值本身符合'商务要求',可直接将其添加至'商务要求'的键值中;若键值本身语义表达不完整,可将键值对拼接之后作为一条商务要求,拼接符号可以是冒号,即':'。
|
2. 你的回答内容需从所给文本中整理,尽量不改变原文的表达,请勿擅自添加三角▲、五角星★(除非以下要求与指南3.的特殊情况)
|
||||||
3. 输入文本中出现三角▲或五角星★开头的地方,请格外重视,不可漏提,若其后面的内容是标题性质的表述、不具备实际的要求,请你保留三角▲或五角星★,根据语义添加至紧跟着的字符串开头中。
|
3. 若输入文本中存在嵌套键值对格式,且键值本身语义完整且符合'商务要求',可直接将其添加至'商务要求'的键值中;若键值本身语义表达不完整,可将键值对用冒号':'拼接之后作为一条商务要求。
|
||||||
3. 若无商务要求,键值为空列表,即[]
|
4. 对于以三角▲或五角星★开头的字符串:
|
||||||
|
a. 如果该字符串仅为标题性质的表述且不具备实际商务要求的含义,请根据语义关联性将其开头的三角▲或五角星★添加到紧随其后的若干(可为一)内容之后,形成完整的商务要求,并确保整个内容连贯。
|
||||||
|
注:默认在该字符串后面的一个字符串开头添加三角▲或五角星★,若有明确的序号或者语义表示了其后若干字符串之间的相关性,那么可在这些字符串开头都添加三角▲或五角星★,作为若干商务要求。
|
||||||
|
b. 如果该字符串已经包含实际的商务要求,那么该内容作为一条完整的商务要求,保留开头的三角▲或五角星★。
|
||||||
|
- 示例输入:
|
||||||
|
```
|
||||||
|
"★ 提供高质量的售后服务,服务期限不少于两年。"
|
||||||
|
```
|
||||||
|
- 示例输出:
|
||||||
|
```
|
||||||
|
"★ 提供高质量的售后服务,服务期限不少于两年。"
|
||||||
|
```
|
||||||
|
c. 无论哪种情况,都需确保不遗漏任何以三角▲或五角星★开头的重要信息。
|
||||||
|
5. 若无商务要求,键值为空列表,即[]
|
||||||
|
|
||||||
### 示例输入如下:
|
### 示例输入如下:
|
||||||
{{
|
{{
|
||||||
"招标要求1": ["▲(1)整个平台运行运维服务,须安排人员驻场对平台进行运行维护,采用 4人轮流值班,依照 7×12小时对可视化督察巡控平台进行操作,确保平台稳定运行,并对线上发现违规操作进行记录,通过督察平台推送督办单给线下监督员小程序进行检查。"]
|
"招标要求1": ["▲(1)整个平台运行运维服务,须安排人员驻场对平台进行运行维护,采用 4人轮流值班,依照 7×12小时对可视化督察巡控平台进行操作,确保平台稳定运行。","▲ (一) 投标人","1.投标人需要获得 ISO9001 质量管理体系认证 、ISO 14001 环境管理体系认证及 OHSAS18001 职业健康安全管理体系认证。","2.投标人具备网络运营商资格。"]
|
||||||
"招标要求2": {{
|
"招标要求2": {{
|
||||||
"合同履行期限": ["★交货期(工期):合同签订之日起 15个日历天内完成,并通过项目验收。"],
|
"合同履行期限": ["★交货期(工期):合同签订之日起 15个日历天内完成,并通过项目验收。"],
|
||||||
"交货地点": ["采购人指定地点"],
|
"交货地点": ["采购人指定地点"],
|
||||||
@ -198,7 +210,9 @@ def extract_business_deviation(procurement):
|
|||||||
### 对应的参考输出如下:
|
### 对应的参考输出如下:
|
||||||
{{
|
{{
|
||||||
"商务要求":[
|
"商务要求":[
|
||||||
"▲整个平台运行运维服务,须安排人员驻场对平台进行运行维护,采用 4人轮流值班,依照 7×12小时对可视化督察巡控平台进行操作,确保平台稳定运行,并对线上发现违规操作进行记录,通过督察平台推送督办单给线下监督员小程序进行检查。",
|
"▲整个平台运行运维服务,须安排人员驻场对平台进行运行维护,采用 4人轮流值班,依照 7×12小时对可视化督察巡控平台进行操作,确保平台稳定运行。",
|
||||||
|
"▲投标人 获得 ISO9001 质量管理体系认证 、ISO 14001 环境管理体系认证及 OHSAS18001 职业健康安全管理体系认证。",
|
||||||
|
"▲投标人具备网络运营商资格"
|
||||||
"★交货期(工期):合同签订之日起 15个日历天内完成,并通过项目验收。",
|
"★交货期(工期):合同签订之日起 15个日历天内完成,并通过项目验收。",
|
||||||
"交货地点:采购人指定地点",
|
"交货地点:采购人指定地点",
|
||||||
"本项目报价须为固定总价,包含但不限于:采购、实施、调试、试运行、验收、运维等所有完成本项目相关的一切费用。",
|
"本项目报价须为固定总价,包含但不限于:采购、实施、调试、试运行、验收、运维等所有完成本项目相关的一切费用。",
|
||||||
@ -207,7 +221,7 @@ def extract_business_deviation(procurement):
|
|||||||
}}
|
}}
|
||||||
|
|
||||||
文本内容:{full_text}
|
文本内容:{full_text}
|
||||||
"""
|
"""
|
||||||
user_query1 = prompt_template1.format(full_text=business_requirements_string)
|
user_query1 = prompt_template1.format(full_text=business_requirements_string)
|
||||||
model_res1 = doubao_model(user_query1)
|
model_res1 = doubao_model(user_query1)
|
||||||
# print(model_res)
|
# print(model_res)
|
||||||
@@ -1,5 +1,175 @@
-import os.path
-
-file_path=r'C:\Users\Administrator\Desktop\货物标\output1\招标文件正文_procurement.pdf'
-file_name=os.path.basename(file_path)
-print(file_name)
+import json
+
+from flask_app.general.doubao import doubao_model
+from flask_app.general.json_utils import clean_json_string
+
+data={
+    "服务要求": [
+        "投标人和制造商在质量保证期内应当为采购人提供以下技术支持和服务:",
+        "(1) 电话咨询中标人和制造商应当为采购人提供技术援助电话,解答采购人在使用中遇到的问题,及时为采购人提出解决问题的建议。",
+        "▲(2) 现场响应采购人遇到使用及技术问题,电话咨询不能解决的, 中标人和制造商应在 2 小内到达现场(远郊区 4 小时内到达现场)进行处理,确保产品正常工作;无法在 8 小时内解决的,应在 24 小时内提供备用产品, 使采购人能够正常使用。",
+        "(3) 技术升级在质保期内,如果中标人和制造商的产品技术升级,供应商应及时通知采购人,如 采购人有相应要求, 中标人和制造商应对采购人购买的产品进行升级服务。",
+        "(4)在质保期内, 中标人每年至少主动为使用单位提供一次售后服务, 做好售后服务记录(使用单位签字盖章 ), 作为退还质保金的依据。",
+        "▲(三) 备品备件及易损件",
+        "中标人和制造商售后服务中,维修使用的备品备件及易损件应为原厂配件,未经采购人同意不得使用非原厂配件,常用的、容易损坏的备品备件及易损件的价格清单须在投标文件中列出。",
+        "▲供应商对其提供产品的使用和操作应尽培训义务。供应商应提供对采购人的基本免费培训, 使采购人使用人员能够正常操作。"
+    ],
+    "商务要求": [
+        "★交货期要求/工期要求签到合同后 15 日历天/60 日历天",
+        "★质保要求三年",
+        "1. 交货期 、交货地点及验收方式",
+        "(一) 交货期(或为: 实施时间)",
+        "1 、 中标人应在采购合同签订后 15 个日历日内完成交货;",
+        "2 、 中标人应在采购合同签订后 60个日历日内完成安装调试并可投入使用。",
+        "(二) 交货地点(或为: 实施地点)",
+        "1 、交货地点: 广水市市内各中小学。",
+        "2 、交货要求:",
+        "(1) 中标人提供的全部货物必须完全符合招标文件要求的品种和数量; 规格型号和技术参数必须完全满足招标文件和中标人在投标中承诺的正偏离的要求;货物的品牌 、生产厂家必须符合中标人在投标中承诺的品牌和生产厂家。",
+        "(2) 中标人提供的货物未达到招标投标文件规定要求, 且对采购人造成损失的, 由中标人承担一切责任, 并赔偿所造成的损失。",
+        "(3) 采购人需要制造商对中标人交付的产品(包括质量 、技术参数等) 进行确认的, 制造商应予以配合, 并出具书面意见。",
+        "(4) 产品包装材料归采购人所有。注:本项目为采购人与使用人(用户)分离的项目 。中标人在送货前须按 采购人的要求 制作送货的相关的表格交采购人审核。",
+        "(三) 验收方式",
+        "1 、到货验收:到货验收由各项目单位(用户) 和中标人共同负责实施 。货物到达各项目单位(用户) 后, 由各项目单位(用户)组织人员和中标人代表在场当面开箱检查,核对货物的品牌、规格型号,查阅货物技术资料、装箱单、合格证等资, 检查外观, 核实货物数量 。到货验收合格条件如下:",
+        "(1) 中标人在合同约定时间内完成交货;",
+        "(2)货物的品牌 、规格型号符合中标人投标的承诺;",
+        "(3)货物技术资料 、装箱单 、合格证等资料齐全;",
+        "(4)货物全新 、完好无损;",
+        "(5)货物数量符合采购人指定各项目单位(用户) 配备数量。",
+        "2 、项目初验:各项目单位货物安装调试完成,运行正常,相关人员技术培训完成后 。中标人向项目单 位(用户)提供完整的项目实施资料(供货清单,货物和合格证、使用说明书、保修卡,货物安装布线图,培训资料,售后服务联系表。规范装订成册) 。 由各项目单位(用户)组织初验, 初验后按采购人要求填写初验合格证明。",
+        "3 、项目终验:项目终验由采购人组织并成立验收小组验收 。项目终验合格条件如下:",
+        "(1)设备技术参数与投标文件和采购合同一致, 性能指标达到 规定的标准;",
+        "(2) 本项目所有项目单位(用户)初验合格(提供项单位初验合格证明 );",
+        "(3) 项目实施资料完整(按采购人要求提供并规范装订成册 ) 。",
+        "(4) 交货 、安装完成时间在合同规定时间内 完成。",
+        "2. 报价要求",
+        "本次报价须为人民币报价,包含:产品价、运输费( 含装卸费)、保险费、安装调试费、 税费、培训费等货到采购人指定地点并完成本项目的所有费用。因投标人自身原因造成漏报、 少报皆由其自行承担责任, 采购人不再补偿。",
+        "3. 质量保证及售后服务",
+        "▲ 1 、投标人应明确承诺: 所投设备免费质保期三年, 并提供设备生产厂家针对此项目的授权书及售后服务承诺函。",
+        "2、投标产品属于国家规定“三包 ”范围的,其产品质量保证期不得低于“三包 ”规定。",
+        "3 、投标人的质量保证期承诺优于国家“三包 ”规定的, 按招标文件要求, 投标人承诺执行。",
+        "4 、投标产品由制造商(指产品生产制造商, 或其负责销售 、售后 服务机构, 以下同) 负责标准售后服务的, 应当在投标文件中予以明确说明,并附制造商售后服务承诺。",
+        "2 、质保期外服务要求",
+        "( 1) 质量保证期过后, 供应商和制造商应同样提供免费电话咨询服务, 并应承诺提供 产品上门维护服务。",
+        "(2) 质量保证期过后, 采购人需要继续由原供应商和制造商提供售后服务 的, 该供应商和制造商应以优惠价格提供售后服务。",
+        "4. 付款方式",
+        "(一)中标人在合同约定时间内完成交货,经采购人审核确认后,中标人出具全额发票, 采购人在 10 个工作日内向市财政局提交申请, 支付合同全额的 40%;",
+        "(二)中标人在合同约定时间内完成所供货物 的安装集成并投入正常使用,经采购人组织验收(终验)合格后, 付到合同总额的 9 5%, 余额 5%留作质保金。",
+        "5. 知识产权",
+        "采购人在中华人民共和国境内使用投标人提供的货物及服务时免受第三方提出的侵犯 其专利权或其它知识产权的起诉。如果第三方提出侵权指控, 中标人应承担由此而引起的一切法律责任和费用。",
+        "6. 培训",
+        "▲供应商对其提供产品的使用和操作应尽培训义务。供应商应提供对采购人的基本免费 培训, 使采购人使用人员能够正常操作。",
+        "7. 投标人及产品制造商要求",
+        "▲ (一) 投标人",
+        "投标人(含集团公司旗下专业子公司) 获得 ISO9001 质量管理体系认证 、ISO 14001 环 境管理体系认证及 OHSAS18001 职业健康安全管理体系认证。投标人(含集团公司)具备工信部核发的《中华人民共和国基础电信业务经营许可证》, 具有合法电信网络运营商资格。以上证明资料提供复印件, 加盖制造商公章。",
+        "▲( 二) 产品制造商",
+        "为保证系统的安全性, 所投视频监控产品制造商需获得中国信息安全测评中心颁发的 《国家安全信息漏洞库(CNNVD) 技术支撑单位等级证书》。提供所投网络产品(交换机) 制造商, 近 2 年第三方权威咨询机构(IDC 或 CCW) 行 业销量排名数据证明。以上证明资料提供复印件, 加盖制造商公章。",
+        "8. 其他",
+        "(一)投标人必须在投标文件中对以上条款和服务承诺明确列出,承诺内容必须达到本 篇及招标文件其他条款的要求。",
+        "(二) 其他未尽事宜由供需双方在采购合同中 详细约定。"
+    ]
+}
+def extract_business_deviation(procurement):
+    new_data = {}
+    counter = 1
+    if "服务要求" in procurement:
+        new_data[f"招标要求{counter}"] = procurement["服务要求"]
+        counter += 1
+
+    # Extract "商务要求"
+    if "商务要求" in procurement:
+        new_data[f"招标要求{counter}"] = procurement["商务要求"]
+        counter += 1
+
+    # Extract "其他要求"
+    if "其他要求" in procurement:
+        new_data[f"招标要求{counter}"] = procurement["其他要求"]
+        counter += 1
+
+    business_requirements_string = json.dumps(new_data, ensure_ascii=False, indent=4)
+    # print(business_requirements_string)
+    prompt_template1 = """请帮我从以下文本中摘取商务要求部分,并将信息重新组织,键名为'商务要求',键值为字符串列表,其中每个字符串为一条商务要求,保留三角▲、五角星★(若有),但是去除开头的序号(若有)。
+#角色
+你是一个专业的招投标业务专家,擅长从招标文件中总结商务要求的部分,并逐条列出,作为编写商务要求偏离表的前置准备。
+
+#要求与指南:
+1. 每条内容需要有实际的含义、要求,不能光有标题性质的表述如'售后服务期限(质保期)及要求'。
+2. 你的回答内容需从所给文本中整理,尽量不改变原文的表达,请勿擅自添加三角▲、五角星★(除非以下要求与指南4.的特殊情况)
+3. 若输入文本中存在嵌套键值对格式,且键值本身语义完整且符合'商务要求',可直接将其添加至'商务要求'的键值中;若键值本身语义表达不完整,可将键值对用冒号':'拼接之后作为一条商务要求。
+4. 对于以三角▲或五角星★开头的字符串:
+a. 如果该字符串仅为标题性质的表述且不具备实际商务要求的含义,请根据语义关联性将其开头的三角▲或五角星★添加到紧随其后的若干(可为一)内容之后,形成完整的商务要求,并确保整个内容连贯。
+注:默认在该字符串后面的一个字符串开头添加三角▲或五角星★,若有明确的序号或者语义表示了其后若干字符串之间的相关性,那么可在这些字符串开头都添加三角▲或五角星★,作为若干商务要求。
+b. 如果该字符串已经包含实际的商务要求,那么该内容作为一条完整的商务要求,保留开头的三角▲或五角星★。
+- 示例输入:
+```
+"★ 提供高质量的售后服务,服务期限不少于两年。"
+```
+- 示例输出:
+```
+"★ 提供高质量的售后服务,服务期限不少于两年。"
+```
+c. 无论哪种情况,都需确保不遗漏任何以三角▲或五角星★开头的重要信息。
+5. 若无商务要求,键值为空列表,即[]
+
+### 示例输入如下:
+{{
+"招标要求1": ["▲(1)整个平台运行运维服务,须安排人员驻场对平台进行运行维护,采用 4人轮流值班,依照 7×12小时对可视化督察巡控平台进行操作,确保平台稳定运行。","▲ (一) 投标人","1.投标人需要获得 ISO9001 质量管理体系认证 、ISO 14001 环境管理体系认证及 OHSAS18001 职业健康安全管理体系认证。","2.投标人具备网络运营商资格。"],
+"招标要求2": {{
+"合同履行期限": ["★交货期(工期):合同签订之日起 15个日历天内完成,并通过项目验收。"],
+"交货地点": ["采购人指定地点"],
+"报价方式": ["(1)本项目报价须为固定总价,包含但不限于:采购、实施、调试、试运行、验收、运维等所有完成本项目相关的一切费用。","(2)因投标人自身原因造成漏报、少报皆由其自行承担责任,采购人不再补偿。"],
+"其他要求": ["无。"]
+}}
+}}
+### 对应的参考输出如下:
+{{
+"商务要求":[
+"▲整个平台运行运维服务,须安排人员驻场对平台进行运行维护,采用 4人轮流值班,依照 7×12小时对可视化督察巡控平台进行操作,确保平台稳定运行。",
+"▲投标人 获得 ISO9001 质量管理体系认证 、ISO 14001 环境管理体系认证及 OHSAS18001 职业健康安全管理体系认证。",
+"▲投标人具备网络运营商资格",
+"★交货期(工期):合同签订之日起 15个日历天内完成,并通过项目验收。",
+"交货地点:采购人指定地点",
+"本项目报价须为固定总价,包含但不限于:采购、实施、调试、试运行、验收、运维等所有完成本项目相关的一切费用。",
+"因投标人自身原因造成漏报、少报皆由其自行承担责任,采购人不再补偿。"
+]
+}}
+
+文本内容:{full_text}
+    """
+    user_query1 = prompt_template1.format(full_text=business_requirements_string)
+    print(user_query1)
+    model_res1 = doubao_model(user_query1)
+    print(model_res1)
+    # business_req_deviation = clean_json_string(model_res1)
+    # prompt_template2 = """以下文本是项目采购需求的商务要求部分,请你帮我从键值列表中各字符串中提取带星★或带三角▲的要求项,你的返回格式同输入文本格式,外键名为'商务要求带星',键值为字符串列表,其中每个字符串为带星★或带三角▲的要求项。
+    # 要求与指南:
+    # 1. 每个星★或三角▲要求占据一个字符串。
+    # 2. 若没有带星★或带三角▲的要求项,键值为空列表,即[]
+    #
+    # 特殊情况处理:
+    # 对于输入类似于'技术要求中带★条款项不满足的视为无效投标'这种描述带星★或带三角▲的响应情况的,它本身不是带星或带三角的要求,因此不需要添加进字符串列表中;仅需把本身是带★或带三角▲的要求添加进来。
+    #
+    # ### 示例输入如下:
+    # {{
+    # "商务要求": [
+    # "考虑设备兼容性、项目实施、交付及售后服务",
+    # "★交货期(工期):合同签订之日起 15个日历天内完成,并通过项目验收。",
+    # "▲本项目报价须为固定总价,包含但不限于:采购、实施、调试、试运行、验收、运维等所有完成本项目相关的一切费用。"
+    # ]
+    # }}
+    # ### 对应的输出如下:
+    # {{
+    # "商务要求带星": [
+    # "★交货期(工期):合同签订之日起 15个日历天内完成,并通过项目验收。",
+    # "▲本项目报价须为固定总价,包含但不限于:采购、实施、调试、试运行、验收、运维等所有完成本项目相关的一切费用。"
+    # ]
+    # }}
+    #
+    # 文本内容:{full_text}
+    # """
+    # user_query2 = prompt_template2.format(full_text=model_res1)
+    # model_res2 = doubao_model(user_query2)
+    # business_star_req_deviation = clean_json_string(model_res2)
+    #
+    # return business_req_deviation, business_star_req_deviation
+
+extract_business_deviation(data)
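The second extraction stage (prompt_template2, which would pull out the ★/▲ items) is left commented out in this test script. A plain-Python stand-in with the same intent is sketched below; it is an assumption-labelled alternative rather than code from the commit, and it relies on the flagged items beginning with ★ or ▲, which is the convention the prompts above enforce.

```python
# Hedged sketch, not part of the commit: a local substitute for prompt_template2
# that keeps only the requirement strings themselves flagged with ★ or ▲.
def pick_starred_requirements(business_req: dict) -> dict:
    items = business_req.get("商务要求", [])
    starred = [s for s in items if s.lstrip().startswith(("★", "▲"))]
    return {"商务要求带星": starred}

# e.g. pick_starred_requirements({"商务要求": ["★质保要求三年", "交货地点:采购人指定地点"]})
# -> {"商务要求带星": ["★质保要求三年"]}
```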
@@ -207,7 +207,11 @@ def generate_template(required_keys, type=1):
"""
)
return textwrap.dedent(
-f"""请你根据该货物类招标文件中的采购要求部分内容,请告诉我该项目采购的{keys_str}分别是什么,请以json格式返回结果,默认情况下外层键名是{outer_keys_str},键值为字符串列表,每个字符串表示具体的一条要求,可以按原文中的序号作划分(若有序号的话),请按原文内容回答,保留三角▲、五角星★和序号(若有),不要擅自增添内容及序号。注意:1. 若相应要求下存在子标题表示子要求因素,可以将它忽略而不是将它与下文具体要求进行多行合并,或者作为该要求下的嵌套键名,总之字符串列表中只提取具体的要求。2. 请不要提取{another_keys_str}中的内容。
+f"""请你根据该货物类招标文件中的采购要求部分内容,请告诉我该项目采购的{keys_str}分别是什么,请以json格式返回结果,默认情况下外层键名是{outer_keys_str},键值为字符串列表,每个字符串表示具体的一条要求,可以按原文中的序号作划分(若有序号的话),请按原文内容回答,保留三角▲、五角星★和序号(若有),不要擅自增删内容。
+
+注意事项:
+1. 若相应要求下存在子标题表示子要求因素但不具备实际的含义、要求,可以将它忽略而不是将它与下文具体要求进行多行合并,或者作为该要求下的嵌套键名,总之字符串列表中只提取具体的要求。
+2. 请不要提取{another_keys_str}中的内容。
+
要求与指南:
1. JSON 的结构要求:
@@ -220,7 +224,7 @@ def generate_template(required_keys, type=1):
2. 请优先且准确定位正文部分包含以下关键字的标题:{outer_keys_str},在其之后提取'XX要求'相关内容,尽量避免在无关地方提取内容。
3. 注意请不要返回Markdown表格语法,必要时使用冒号':'将相关信息拼接在一起
{specific_instructions}
-6. 字符串列表中的每个字符串内容需与原文内容保持一致,保留前面的三角▲、五角星★和序号(若有),而且你不可以擅自添加序号。
+6. 字符串列表中的每个字符串内容需与原文内容保持一致,保留前面的三角▲、五角星★和序号(如果有),但不可以擅自添加这些内容。
""")

# 过滤示例内容
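As an illustration of how the revised template might be rendered, a hedged usage sketch follows; the key names passed in are placeholders chosen for this example, not values taken from the commit.

```python
# Hedged sketch, not part of the commit: rendering the revised prompt.
# "服务要求" / "商务要求" are illustrative required_keys, not values from the repo.
required_keys = ["服务要求", "商务要求"]
prompt = generate_template(required_keys, type=1)
print(prompt)  # includes the new 注意事项 block shown in the hunk above
```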
|