zbparse/flask_app/main/table_content_extraction.py

101 lines
3.8 KiB
Python
Raw Normal View History

2024-08-29 16:37:09 +08:00
from docx import Document
import json
2024-09-03 18:04:05 +08:00
2024-08-29 16:37:09 +08:00
def read_tables_from_docx(file_path):
    """Read the tables of a DOCX file and return their content as a nested dict.

    The very first row encountered (across all tables) is remembered as the
    header and skipped; any later row whose stripped cell texts equal that
    header is skipped as well.

    For every remaining row, each cell except the last is used as a key and
    opens one more nesting level. Keys have spaces and newlines removed. An
    empty key cell reuses the key found at the same column of the previously
    processed row, or a placeholder string when there is none.

    The last cell of the row is the value (spaces removed, newlines kept). It
    is stored inside the innermost level under the row's last key, or under
    "<row index>行内容" when the row consists of a single cell. When that slot
    already holds a string, the new value is appended after a newline.

    Args:
        file_path: Path (or file-like object) handed to ``Document``.

    Returns:
        The nested dictionary built from all tables.
    """
    document = Document(file_path)
    nested = {}
    previous_titles = []
    header_row = None

    for table in document.tables:
        for row_index, row in enumerate(table.rows):
            stripped = [cell.text.strip() for cell in row.cells]

            # The first row seen overall becomes the header and is skipped.
            if header_row is None:
                header_row = stripped
                continue

            # Skip repetitions of the header row.
            if stripped == header_row:
                continue

            node = nested
            titles = []
            last_col = len(stripped) - 1

            for col, raw in enumerate(stripped):
                compact = raw.replace(' ', '')

                if col < last_col:
                    # Key cell: descend one level, creating it when missing.
                    key = compact.replace('\n', '')
                    if key == "":
                        if col < len(previous_titles):
                            key = previous_titles[col]
                        else:
                            key = "<未识别到上级标题>"
                    if key not in node:
                        node[key] = {}
                    node = node[key]
                    titles.append(key)
                    continue

                # Value cell: keyed by the last title, or by the row index
                # when the row has no key cells at all.
                value_key = titles[-1] if len(titles) > 0 else f"{row_index}行内容"
                if value_key not in node:
                    node[value_key] = compact
                elif isinstance(node[value_key], dict):
                    # A nested level already sits here; it is replaced.
                    node[value_key] = f"\n{compact}"
                else:
                    node[value_key] += f"\n{compact}"

            # Remember this row's keys so blank cells below can inherit them.
            previous_titles = titles[:]

    return nested
def flatten_nested_dicts(d):
    """Collapse redundant single-entry levels of a nested dict, in place.

    Every dict value is flattened recursively first. Afterwards a value that
    holds exactly one entry is replaced by that entry's value when either

    * the inner key equals the outer key (``{"a": {"a": v}}`` -> ``{"a": v}``), or
    * the inner key equals the inner value (``{"a": {"x": "x"}}`` -> ``{"a": "x"}``).

    Replacements are written directly to the existing key, so the original
    key order of ``d`` is preserved (deleting and re-adding the key would move
    every collapsed entry to the end of the dict).

    Args:
        d: The dictionary to simplify; it is modified in place.

    Returns:
        The same dictionary object ``d``.
    """
    # Iterate over a snapshot because entries of ``d`` are reassigned below.
    for key, value in list(d.items()):
        if not isinstance(value, dict):
            continue

        value = flatten_nested_dicts(value)
        if len(value) != 1:
            continue

        inner_key, inner_value = next(iter(value.items()))
        if inner_key == key or inner_key == inner_value:
            d[key] = inner_value

    return d
def save_data_to_json(data, filename):
    """Write ``data`` to ``filename`` as UTF-8 JSON.

    Non-ASCII characters are written as-is (not escaped) and the output is
    indented by four spaces.
    """
    with open(filename, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, indent=4, ensure_ascii=False)
def extract_tables_main(path, output_filename):
    """Extract the tables of a DOCX file, simplify them and save them as JSON.

    Args:
        path: Location of the DOCX file to read.
        output_filename: Location of the JSON file to write.
    """
    # Read the tables, flatten the redundant nesting, then persist the result.
    flattened_data = flatten_nested_dicts(read_tables_from_docx(path))
    save_data_to_json(flattened_data, output_filename)

    print(f"The data has been processed and saved to '{output_filename}'.")
if __name__ == "__main__":
    # Developer-local sample input; adjust both paths before running elsewhere.
    path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest20\\zbtest20_17-22.docx'
    # Destination JSON file for the extracted table data.
    output_filename = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest20\\truncate_output.json"

    extract_tables_main(path, output_filename)