# 106 lines
# 3.2 KiB
# Python

from docx import Document
def read_docx(file_path):
    """Print the text of every paragraph in a .docx file.

    Args:
        file_path: Path of the .docx file to open.

    Returns:
        None. If the document cannot be opened, an error message is
        printed and the function returns early.
    """
    separator = "----------------------------"

    try:
        document = Document(file_path)
    except Exception as err:
        # Best-effort helper: report the failure instead of propagating it.
        print(f"Error opening file: {err}")
        return None

    # NOTE(review): the separator is assumed to follow each paragraph
    # (rather than being printed once after the last one) -- confirm.
    for paragraph in document.paragraphs:
        print(paragraph.text)
        print(separator)
def read_docx_tables(file_path):
    """Print every table of a .docx file, one row per line.

    Args:
        file_path: Path of the .docx file to open.

    Returns:
        None. A message is printed instead when the document cannot be
        opened or when it contains no tables.
    """
    try:
        document = Document(file_path)
    except Exception as err:
        # Best-effort helper: report the failure instead of propagating it.
        print(f"Error opening file: {err}")
        return None

    tables = document.tables
    if not tables:
        print("No tables found in the document.")
        return None

    divider = "\n" + "-" * 40 + "\n"
    for table_number, table in enumerate(tables, start=1):
        print(f"Table {table_number}:")
        for row_number, row in enumerate(table.rows, start=1):
            # Cell texts are stripped of surrounding whitespace before printing.
            row_data = [cell.text.strip() for cell in row.cells]
            print(f"Row {row_number}: {row_data}")
        # NOTE(review): the divider is assumed to be printed once per table -- confirm.
        print(divider)
def read_tables_from_docx(file_path, min_length=6):
    """Collect the text of table cells that are longer than ``min_length``.

    Args:
        file_path: Path of the .docx file to open.
        min_length: A cell is kept only when its stripped text has strictly
            more than this many characters. Defaults to 6, the threshold
            this function has always applied.

    Returns:
        list: Stripped cell texts in table -> row -> cell order. An empty
        list is returned (after printing a message) when the document
        cannot be opened or contains no tables.
    """
    try:
        doc = Document(file_path)
    except Exception as e:
        # Best-effort helper: report the failure and hand back an empty result.
        print(f"Error opening file: {e}")
        return []

    if not doc.tables:
        print("No tables found in the document.")
        return []

    cell_contents = []
    for table in doc.tables:
        for row in table.rows:
            # NOTE(review): python-docx may yield the same merged cell more
            # than once in row.cells, so duplicate texts are possible --
            # confirm whether callers want them de-duplicated.
            for cell in row.cells:
                cell_text = cell.text.strip()
                # Strict comparison: text of exactly min_length characters is dropped.
                if len(cell_text) > min_length:
                    cell_contents.append(cell_text)
    return cell_contents
def read_docx_by_paragraphs(file_path):
    """Read a .docx file paragraph by paragraph.

    Args:
        file_path (str): Path of the .docx file.

    Returns:
        list: The stripped text of every paragraph that is not empty after
        stripping. An empty list is returned (after printing a message)
        if anything goes wrong while reading.
    """
    try:
        document = Document(file_path)
        texts = []
        for paragraph in document.paragraphs:
            stripped = paragraph.text.strip()
            if stripped:  # skip paragraphs that are blank after stripping
                texts.append(stripped)
        return texts
    except Exception as err:
        print(f"读取 .docx 文件时发生错误: {err}")
        return []
if __name__ == "__main__":
    import sys

    # Sample document used when no path is given on the command line.
    default_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\invalid_added.docx"
    # An optional first CLI argument overrides the machine-specific default.
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path

    read_docx(file_path)  # print the document paragraph by paragraph

    # Alternative entry points, kept for manual experimentation:
    #
    # paragraphs = read_docx_by_paragraphs(file_path)  # non-empty paragraphs as a list
    # print(f"共读取到 {len(paragraphs)} 个段落。\n")
    # for idx, para in enumerate(paragraphs, 1):
    #     print(f"段落 {idx}: {para}\n")
    #
    # read_docx_tables(file_path)
    #
    # cells = read_tables_from_docx(file_path)  # avoid shadowing the builtin `list`
    # for cell_text in cells:
    #     print(cell_text)
    #     print("--------------")