zbparse/flask_app/main/table_content_extraction.py
2024-08-29 16:37:09 +08:00

88 lines
3.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from docx import Document
import json
def read_tables_from_docx(file_path):
    """Read all tables of a DOCX file into a single nested dictionary.

    Every row is interpreted as ``title, title, ..., value``: each cell except
    the last one becomes one nesting level (a key), and the last cell is the
    text stored at the end of that path. Rows of all tables are merged into
    the same dictionary.

    Args:
        file_path: Path of the DOCX file, passed straight to ``Document``.

    Returns:
        dict: Nested dictionary keyed by titles. A row's value is stored
        inside the innermost title's dictionary under that same title again,
        so a row ``A | text`` yields ``{"A": {"A": "text"}}``. A row with a
        single cell is stored at the top level under ``f"{i}行内容"``, where
        ``i`` is the row index within its table.
    """
    doc = Document(file_path)
    table_list = {}
    # Titles of the previously processed row. Deliberately not reset between
    # tables, so a blank title can inherit from the last row of the prior table.
    cur_title = []
    for table in doc.tables:
        for i, row in enumerate(table.rows):
            # python-docx rebuilds ``row.cells`` on every access, so read it
            # once per row instead of once per cell.
            cells = row.cells
            last_index = len(cells) - 1
            cur_level = table_list
            temp_title = []
            for j, cell in enumerate(cells):
                if j < last_index:
                    # Title cell: drop spaces and line breaks so keys are compact.
                    text_str = cell.text.strip().replace(' ', '').replace('\n', '')
                    if text_str == "":
                        # Blank title: reuse the title found in the same
                        # column of the previous row, else a placeholder.
                        text_str = cur_title[j] if j < len(cur_title) else "<未识别到上级标题>"
                    # NOTE: if this key already holds text from an earlier row,
                    # cur_level becomes a str and the next write raises TypeError.
                    if text_str not in cur_level:
                        cur_level[text_str] = {}
                    cur_level = cur_level[text_str]
                    temp_title.append(text_str)
                else:
                    # Value cell: spaces are removed but line breaks are kept.
                    cell_text = cell.text.strip().replace(' ', '')
                    # Store under the last title, or under a per-row key when
                    # the row has no title cells at all.
                    last_key = temp_title[-1] if temp_title else f"{i}行内容"
                    if last_key in cur_level:
                        if isinstance(cur_level[last_key], dict):
                            # An existing sub-dictionary is replaced by the text.
                            cur_level[last_key] = f"\n{cell_text}"
                        else:
                            # Append to the text already stored for this key.
                            cur_level[last_key] += f"\n{cell_text}"
                    else:
                        cur_level[last_key] = cell_text
            # temp_title is a fresh list for each row, so no copy is needed.
            cur_title = temp_title
    return table_list
def flatten_nested_dicts(d):
    """Collapse redundant single-entry nesting in a nested dictionary.

    The dictionary is processed recursively and modified in place. A value
    that is a dictionary with exactly one entry is replaced by that entry's
    value when either

    * the inner key equals the outer key (``{"A": {"A": v}}`` -> ``{"A": v}``), or
    * the inner key equals the inner value (``{"A": {"B": "B"}}`` -> ``{"A": "B"}``).

    Replacements are assigned in place, so the keys keep their original
    order; collapsed entries are not moved to the end of the dictionary.

    Args:
        d: Nested dictionary to simplify; it is mutated.

    Returns:
        dict: The same object ``d``, after simplification.
    """
    # Iterate over a snapshot because values of ``d`` are reassigned below.
    for key, value in list(d.items()):
        if not isinstance(value, dict):
            continue
        # Simplify the children first so deeper redundancy bubbles up.
        flatten_nested_dicts(value)
        if len(value) != 1:
            continue
        (inner_key, inner_value), = value.items()
        if inner_key == key or inner_key == inner_value:
            d[key] = inner_value
    return d
def save_data_to_json(data, filename):
    """Write *data* to *filename* as indented UTF-8 JSON.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) and the
    output is indented by four spaces.

    Args:
        data: JSON-serializable object to store.
        filename: Path of the file to create or overwrite.

    Raises:
        TypeError: If ``data`` contains an object ``json`` cannot serialize.
            The target file is left untouched in that case.
    """
    # Serialize before opening the file: opening in 'w' mode truncates it, so
    # a serialization error must not be able to destroy or half-write it.
    payload = json.dumps(data, ensure_ascii=False, indent=4)
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(payload)
def extract_tables_main(path, output_filename):
    """Extract the tables of a DOCX file and save them as flattened JSON.

    Args:
        path: Path of the DOCX file to read.
        output_filename: Path of the JSON file to write.
    """
    # Pipeline: read the tables -> collapse redundant nesting -> write JSON.
    save_data_to_json(
        flatten_nested_dicts(read_tables_from_docx(path)),
        output_filename,
    )
    print(f"The data has been processed and saved to '{output_filename}'.")
if __name__ == "__main__":
    # Manual run with hard-coded local paths; adjust both before use.
    docx_path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标03_tobidders_notice_table.docx'
    # JSON file that receives the extracted bidder-notice table.
    json_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\truncate_output.json"
    extract_tables_main(path=docx_path, output_filename=json_path)