from docx import Document


def _open_document(file_path):
    """Open *file_path* with python-docx, reporting any failure on stdout.

    Args:
        file_path: Path to the .docx file.

    Returns:
        The loaded document, or ``None`` if ``Document(file_path)`` raised.
    """
    # Any failure is printed and swallowed so callers never see an exception
    # from opening the file; they only have to check for ``None``.
    try:
        return Document(file_path)
    except Exception as e:
        print(f"Error opening file: {e}")
        return None


def read_docx(file_path):
    """Print the text of every paragraph, each followed by a separator line.

    Args:
        file_path: Path to the .docx file.

    Returns:
        None. If the file cannot be opened, an error is printed instead.
    """
    doc = _open_document(file_path)
    if doc is None:
        return

    for para in doc.paragraphs:
        print(para.text)
        print("----------------------------")


def read_docx_tables(file_path):
    """Print every table of the document, one list of cell texts per row.

    Args:
        file_path: Path to the .docx file.

    Returns:
        None. Prints a message instead when the file cannot be opened or the
        document contains no tables.
    """
    doc = _open_document(file_path)
    if doc is None:
        return

    if not doc.tables:
        print("No tables found in the document.")
        return

    for table_idx, table in enumerate(doc.tables):
        print(f"Table {table_idx + 1}:")
        for row_idx, row in enumerate(table.rows):
            # Cell text is stripped of leading/trailing whitespace.
            row_data = [cell.text.strip() for cell in row.cells]
            print(f"Row {row_idx + 1}: {row_data}")
        print("\n" + "-" * 40 + "\n")  # separator between tables


def read_tables_from_docx(file_path, length_threshold=6):
    """Collect the text of table cells that are longer than a threshold.

    Args:
        file_path: Path to the .docx file.
        length_threshold: A cell is kept only when its stripped text has
            strictly more characters than this value. The default of 6
            matches the previously hard-coded ``len(cell_text) > 6`` check.

    Returns:
        list: Stripped cell texts in table/row/cell order. Empty when the
        file cannot be opened or the document has no tables.
    """
    doc = _open_document(file_path)
    if doc is None:
        return []

    cell_contents = []

    if not doc.tables:
        print("No tables found in the document.")
        return []

    # NOTE(review): python-docx documents that a merged cell is reported once
    # per grid position it covers, so its text may appear more than once here
    # -- confirm whether callers want duplicates removed.
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                cell_text = cell.text.strip()
                if len(cell_text) > length_threshold:
                    cell_contents.append(cell_text)

    return cell_contents


def read_docx_by_paragraphs(file_path):
    """Read a .docx file paragraph by paragraph.

    Args:
        file_path (str): Path to the .docx file.

    Returns:
        list: Stripped text of every paragraph that is non-empty after
        stripping. Empty when opening or reading the document raises.
    """
    try:
        doc = Document(file_path)
        # Strip each paragraph once and drop the ones that end up empty.
        stripped = (para.text.strip() for para in doc.paragraphs)
        return [text for text in stripped if text]
    except Exception as e:
        print(f"读取 .docx 文件时发生错误: {e}")
        return []


if __name__ == "__main__":
    file_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\invalid_added.docx"
    read_docx(file_path)  # print every paragraph of the document

    # Other helpers that can be run manually against the same file:
    #   read_docx_by_paragraphs(file_path) -> list of non-empty paragraph texts
    #   read_docx_tables(file_path)        -> prints every table row by row
    #   read_tables_from_docx(file_path)   -> list of the longer table cell texts