"""Helpers for dumping paragraph and table text out of a .docx file."""

from typing import List

from docx import Document


def _open_document(file_path):
    """Open *file_path* with python-docx; print the error and return None on failure."""
    try:
        return Document(file_path)
    except Exception as e:
        # Best-effort script behaviour: report the problem and let the caller
        # bail out instead of propagating the exception.
        print(f"Error opening file: {e}")
        return None


def read_docx(file_path) -> None:
    """Print the text of every paragraph in the document.

    Args:
        file_path: Location of the .docx file; passed straight to ``Document``.

    Returns:
        None. If the document cannot be opened, an error line is printed and
        nothing else happens.
    """
    doc = _open_document(file_path)
    if doc is None:
        return

    for para in doc.paragraphs:
        print(para.text)


def read_docx_tables(file_path) -> None:
    """Print every table in the document, one row per line.

    Each table is introduced by a ``Table N:`` header, each row is printed as
    ``Row M: [...]`` with whitespace-stripped cell texts, and a dashed
    separator is printed after each table.

    Args:
        file_path: Location of the .docx file; passed straight to ``Document``.

    Returns:
        None. Prints an error line if the document cannot be opened, or a
        notice if the document contains no tables.
    """
    doc = _open_document(file_path)
    if doc is None:
        return

    if not doc.tables:
        print("No tables found in the document.")
        return

    for table_number, table in enumerate(doc.tables, start=1):
        print(f"Table {table_number}:")
        for row_number, row in enumerate(table.rows, start=1):
            # Strip surrounding whitespace from every cell in the row.
            row_data = [cell.text.strip() for cell in row.cells]
            print(f"Row {row_number}: {row_data}")
        print("\n" + "-" * 40 + "\n")  # separator between tables


def read_tables_from_docx(file_path, min_length: int = 6) -> List[str]:
    """Collect the text of every table cell that is longer than *min_length*.

    Cells are visited table by table, row by row, in document order, and
    their text is stripped of surrounding whitespace before its length is
    checked.

    Args:
        file_path: Location of the .docx file; passed straight to ``Document``.
        min_length: A cell is kept only when its stripped text has strictly
            more than this many characters. Defaults to 6.
            NOTE(review): an earlier inline comment described the threshold
            as 5 while the code compared against 6; the code's behaviour is
            preserved here - confirm which value is intended.

    Returns:
        A list of the qualifying cell texts. The list is empty when the
        document cannot be opened (an error line is printed) or when it has
        no tables (a notice is printed).
    """
    doc = _open_document(file_path)
    if doc is None:
        return []

    if not doc.tables:
        print("No tables found in the document.")
        return []

    cell_contents = []
    for table in doc.tables:
        for row in table.rows:
            stripped_texts = (cell.text.strip() for cell in row.cells)
            cell_contents.extend(
                text for text in stripped_texts if len(text) > min_length
            )
    return cell_contents


def main() -> None:
    """Dump the tables of the sample document, then list its long cells."""
    file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\2-招标文件.docx'
    # output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\tmp"  # JSON file for the preliminary data sheet
    # read_docx(file_path)
    read_docx_tables(file_path)

    # Named `long_cells` rather than `list` so the builtin is not shadowed.
    long_cells = read_tables_from_docx(file_path)
    for cell_text in long_cells:
        print(cell_text)
        print("--------------")


if __name__ == "__main__":
    main()