"""Extract tables from a DOCX file into a nested dict and save them as JSON."""

import json

try:
    from docx import Document
except ImportError:
    # python-docx is only needed by read_tables_from_docx(); keeping the import
    # soft lets the pure-dict helpers below be used without it installed.
    Document = None


def _store_cell_value(level, key, text):
    """Store *text* under *key* in *level*, appending when the key repeats.

    A first occurrence stores the text as-is.  A repeated key appends the new
    text on its own line.  If the key currently holds a nested dict, that dict
    is replaced by a newline followed by the text.
    """
    if key not in level:
        level[key] = text
    elif isinstance(level[key], dict):
        level[key] = f"\n{text}"
    else:
        level[key] += f"\n{text}"


def read_tables_from_docx(file_path):
    """Read the tables of a DOCX file and return them as a nested dict.

    For every row, each cell except the last is treated as a key and opens one
    nesting level; the last cell is the value.  The value is stored under the
    last key of the row (so a row ``[A, v]`` yields ``{A: {A: v}}``), or under
    ``"第{i}行内容"`` when the row has a single cell.  An empty key cell reuses
    the key found at the same column in the previous row.

    Args:
        file_path: Path of the DOCX file, passed to ``docx.Document``.

    Returns:
        A nested dict with the contents of all tables merged together.

    Raises:
        ImportError: If python-docx is not installed.
    """
    if Document is None:
        raise ImportError("python-docx is required to read tables from a DOCX file")

    doc = Document(file_path)
    table_list = {}
    cur_title = []  # keys of the previous row, used to fill empty key cells

    for table in doc.tables:
        for i, row in enumerate(table.rows):
            # Read row.cells once per row instead of once per cell.
            cells = row.cells
            last_index = len(cells) - 1
            cur_level = table_list
            temp_title = []

            for j, cell in enumerate(cells):
                if j < last_index:
                    # Key cell: drop spaces and line breaks from the key text.
                    text_str = cell.text.strip().replace(' ', '').replace('\n', '')
                    if text_str == "":
                        text_str = cur_title[j] if j < len(cur_title) else "<未识别到上级标题>"
                    if text_str not in cur_level:
                        cur_level[text_str] = {}
                    # NOTE(review): if an earlier row stored a plain string
                    # under this key, cur_level becomes a str here and the next
                    # item access/assignment raises TypeError.
                    cur_level = cur_level[text_str]
                    temp_title.append(text_str)
                else:
                    # Value cell: line breaks are kept, only spaces are removed.
                    cell_text = cell.text.strip().replace(' ', '')
                    last_key = temp_title[-1] if temp_title else f"第{i}行内容"
                    _store_cell_value(cur_level, last_key, cell_text)

            cur_title = temp_title[:]

    return table_list


def flatten_nested_dicts(d):
    """Collapse redundant single-entry levels of a nested dict, in place.

    A nested dict is replaced by its only value when it has exactly one entry
    and that entry's key either equals the parent key (``{k: {k: v}}`` becomes
    ``{k: v}``) or equals its own value (``{k: {x: x}}`` becomes ``{k: x}``).
    Keys keep their original position, so the row order of the source table
    is preserved in the output.

    Args:
        d: The dict to flatten; it is modified in place.

    Returns:
        The same dict object ``d``.
    """
    # Iterate over a snapshot: values are reassigned while looping.
    for key, value in list(d.items()):
        if not isinstance(value, dict):
            continue
        value = flatten_nested_dicts(value)
        if len(value) != 1:
            continue
        (only_key, only_value), = value.items()
        if only_key == key or only_key == only_value:
            # Assign in place (no delete + re-insert) to keep the key order.
            d[key] = only_value
    return d


def save_data_to_json(data, filename):
    """Write *data* to *filename* as UTF-8 JSON, indented, non-ASCII kept as-is."""
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)


def extract_tables_main(path, output_filename):
    """Read the tables of the DOCX at *path*, flatten them and save as JSON.

    Args:
        path: Path of the input DOCX file.
        output_filename: Path of the JSON file to write.
    """
    table_data = read_tables_from_docx(path)
    flattened_data = flatten_nested_dicts(table_data)
    save_data_to_json(flattened_data, output_filename)
    print(f"The data has been processed and saved to '{output_filename}'.")


if __name__ == "__main__":
    path = 'C:\\Users\\Administrator\\Desktop\\招标文件\\招标03_tobidders_notice_table.docx'
    output_filename = "C:\\Users\\Administrator\\Desktop\\招标文件\\truncate_output.json"  # JSON output for the bidder-notice front table
    extract_tables_main(path, output_filename)