34 lines
1.2 KiB
Python
34 lines
1.2 KiB
Python
# Layout parsing - multi-column - natural paragraphs (排版解析-多栏-自然段)
|
|
|
|
from .tbpu import Tbpu
|
|
from .parser_tools.line_preprocessing import linePreprocessing # 行预处理
|
|
from .parser_tools.gap_tree import GapTree # 间隙树排序算法
|
|
from .parser_tools.paragraph_parse import ParagraphParse # 段内分析器
|
|
|
|
|
|
class MultiPara(Tbpu):
    """Text-block processing unit for multi-column layouts with natural paragraphs.

    Text blocks are preprocessed, ordered with a gap tree, and then the
    blocks of every tree node are handed to the paragraph parser, which
    reports a predicted trailing separator that is stored under each
    block's ``"end"`` key.
    """

    def __init__(self):
        """Create the gap-tree sorter and the in-paragraph parser."""
        self.tbpuName = "排版解析-多栏-自然段"

        def bbox_of(tb):
            # Accessor given to the gap tree: the block's normalized bounding box.
            return tb["normalized_bbox"]

        def info_of(tb):
            # Accessor given to the paragraph parser: (normalized bbox, text).
            return tb["normalized_bbox"], tb["text"]

        def set_end(tb, end):
            # Callback receiving the predicted end-of-block separator.
            tb["end"] = end

        # Gap-tree object (gap-tree sorting algorithm).
        self.gtree = GapTree(bbox_of)
        # In-paragraph analyser object.
        self.pp = ParagraphParse(info_of, set_end)

    def run(self, textBlocks):
        """Sort ``textBlocks`` and annotate each block with its end separator.

        Args:
            textBlocks: text blocks to process; after preprocessing each
                block is read via its ``"normalized_bbox"`` and ``"text"``
                keys (presumably ``linePreprocessing`` adds the former --
                confirm against that helper).

        Returns:
            Whatever ``self.gtree.sort`` returned for the preprocessed blocks.
        """
        # Preprocess the lines, then build the gap tree / sort the blocks.
        preprocessed = linePreprocessing(textBlocks)
        ordered = self.gtree.sort(preprocessed)

        # Sequence of per-node block lists; must be fetched after sorting.
        node_groups = self.gtree.get_nodes_text_blocks()

        # Natural-paragraph analysis, one tree node at a time.
        for group in node_groups:
            # Predict the trailing separators for this node's blocks.
            self.pp.run(group)
            # The helper key is no longer needed once the node is parsed.
            for block in group:
                del block["normalized_bbox"]

        return ordered
|