# zbparse/flask_app/general/读取文件/读取docx.py

from docx import Document

def read_docx(file_path):
    """Print every paragraph of a .docx file, each followed by a divider line.

    Args:
        file_path: Path of the .docx file, passed straight to ``Document``.

    Returns:
        None. If the document cannot be opened, an error message is printed
        and the function returns early without raising.
    """
    try:
        document = Document(file_path)
    except Exception as err:
        # Best-effort helper: report the problem and give up instead of raising.
        print(f"Error opening file: {err}")
        return None

    for paragraph in document.paragraphs:
        # One call emits the paragraph text and then the divider on its own line.
        print(paragraph.text, "----------------------------", sep="\n")
def read_docx_tables(file_path):
    """Print the contents of every table in a .docx file, row by row.

    Each table is introduced by a ``Table N:`` header, each row is printed as
    ``Row N:`` followed by the list of its stripped cell texts, and a dashed
    separator is printed after each table.

    Args:
        file_path: Path of the .docx file, passed straight to ``Document``.

    Returns:
        None. If the document cannot be opened or contains no tables, a
        message is printed and the function returns early.
    """
    try:
        document = Document(file_path)
    except Exception as err:
        print(f"Error opening file: {err}")
        return None

    tables = document.tables
    if not tables:
        print("No tables found in the document.")
        return None

    # Blank line, 40 dashes, blank line: printed after each table.
    separator = "\n" + "-" * 40 + "\n"

    for table_no, table in enumerate(tables, start=1):
        print(f"Table {table_no}:")
        for row_no, row in enumerate(table.rows, start=1):
            # Surrounding whitespace is removed from every cell's text.
            stripped_cells = [cell.text.strip() for cell in row.cells]
            print(f"Row {row_no}: {stripped_cells}")
        print(separator)

def read_tables_from_docx(file_path, min_length=6):
    """Collect the text of table cells whose content exceeds a length threshold.

    Args:
        file_path: Path of the .docx file, passed straight to ``Document``.
        min_length: A cell is kept only when its stripped text is strictly
            longer than this many characters. Defaults to 6, which is the
            threshold that used to be hard-coded here.

    Returns:
        list: The stripped text of every qualifying cell, in the order the
        tables, rows and cells are iterated. An empty list is returned (after
        printing a message) when the file cannot be opened or has no tables.
    """
    try:
        doc = Document(file_path)
    except Exception as e:
        # Best-effort helper: report the problem and return an empty result.
        print(f"Error opening file: {e}")
        return []

    tables = doc.tables
    if not tables:
        print("No tables found in the document.")
        return []

    # NOTE(review): duplicates are not filtered. If the library reports a
    # merged cell more than once per row, its text repeats here - confirm
    # whether callers rely on that.
    cell_contents = []
    for table in tables:
        for row in table.rows:
            for cell in row.cells:
                cell_text = cell.text.strip()
                # Strictly greater than: text of exactly min_length chars is skipped.
                if len(cell_text) > min_length:
                    cell_contents.append(cell_text)

    return cell_contents


def read_docx_by_paragraphs(file_path):
    """Read a .docx file and return its non-empty paragraphs.

    Args:
        file_path (str): Path of the .docx file.

    Returns:
        list: The whitespace-stripped text of every paragraph that is not
        empty after stripping. If anything goes wrong while opening or
        reading the document, an error message is printed and an empty list
        is returned.
    """
    try:
        document = Document(file_path)

        non_empty = []
        for paragraph in document.paragraphs:
            stripped = paragraph.text.strip()
            # Paragraphs that are blank after stripping are dropped.
            if stripped:
                non_empty.append(stripped)
    except Exception as err:
        print(f"读取 .docx 文件时发生错误: {err}")
        return []

    return non_empty

if __name__ == "__main__":
    # Sample document for manual runs; point this at a file that exists locally.
    sample_docx = r"C:\Users\Administrator\Desktop\货物标\zbfiles\invalid_added.docx"

    # Print every paragraph, each followed by a divider line.
    read_docx(sample_docx)

    # Alternative manual checks, kept disabled.
    #
    # Read the non-empty paragraphs into a list and print them numbered:
    # paragraphs = read_docx_by_paragraphs(sample_docx)
    # print(f"共读取到 {len(paragraphs)} 个段落。\n")
    # for idx, para in enumerate(paragraphs, 1):
    #     print(f"段落 {idx}: {para}\n")
    #
    # Print every table row by row:
    # read_docx_tables(sample_docx)
    #
    # Collect and print the longer table cells:
    # cells = read_tables_from_docx(sample_docx)
    # for cell_text in cells:
    #     print(cell_text)
    #     print("--------------")