zbparse/flask_app/main/读取文件/读取docx.py

from docx import Document

def read_docx(file_path):
    """Print the text of every paragraph in a .docx file.

    Args:
        file_path: Location of the document; handed straight to ``Document``.

    Returns:
        None. When the document cannot be opened, an error message is
        printed and no paragraph output is produced.
    """
    try:
        document = Document(file_path)
    except Exception as err:
        # Report the failure and stop quietly instead of raising.
        print(f"Error opening file: {err}")
    else:
        # One printed line per paragraph, in document order.
        for paragraph in document.paragraphs:
            print(paragraph.text)
def read_docx_tables(file_path):
    """Print every table of a .docx file, one row per output line.

    Each table is introduced by a ``Table N:`` header, each row is shown as
    ``Row M: [...]`` with the stripped text of its cells, and a dashed
    separator is printed after every table.

    Args:
        file_path: Location of the document; handed straight to ``Document``.

    Returns:
        None. A message is printed instead when the document cannot be
        opened or when it contains no tables.
    """
    try:
        document = Document(file_path)
    except Exception as err:
        print(f"Error opening file: {err}")
        return

    tables = document.tables
    if not tables:
        print("No tables found in the document.")
        return

    # Dashed rule surrounded by blank lines, printed after each table.
    separator = "\n" + "-" * 40 + "\n"

    # Tables and rows are numbered from 1 in the output.
    for table_no, table in enumerate(tables, start=1):
        print(f"Table {table_no}:")
        for row_no, row in enumerate(table.rows, start=1):
            # Surrounding whitespace is removed from each cell's text.
            stripped_cells = [cell.text.strip() for cell in row.cells]
            print(f"Row {row_no}: {stripped_cells}")
        print(separator)

def read_tables_from_docx(file_path, *, length_threshold=6):
    """Collect the text of table cells that are longer than a threshold.

    Args:
        file_path: Location of the .docx file; handed straight to
            ``Document``.
        length_threshold: A cell is kept only when its stripped text has
            strictly more characters than this value. Defaults to 6, the
            value that was previously hard-coded.

    Returns:
        A list of stripped cell texts in table, row, cell order. The list is
        empty when the document cannot be opened, when it has no tables, or
        when no cell passes the length filter.
    """
    try:
        doc = Document(file_path)
    except Exception as e:
        # Best-effort helper: report the problem and fall back to "nothing found".
        print(f"Error opening file: {e}")
        return []

    tables = doc.tables
    if not tables:
        print("No tables found in the document.")
        return []

    # NOTE(review): the earlier inline comment described the filter as
    # "more than 5 characters" while the code compared against 6. The code's
    # behavior is preserved as the default; confirm which value was intended.
    cell_contents = []
    for table in tables:
        for row in table.rows:
            for cell in row.cells:
                cell_text = cell.text.strip()  # ignore surrounding whitespace
                if len(cell_text) > length_threshold:
                    cell_contents.append(cell_text)

    return cell_contents

if __name__ == "__main__":
    # Manual smoke run against a hard-coded local sample document.
    file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\2-招标文件.docx'
    # output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\tmp"  # JSON file for the front attachment table
    # read_docx(file_path)
    read_docx_tables(file_path)
    # Named cell_texts rather than `list` so the builtin is not shadowed
    # at module level.
    cell_texts = read_tables_from_docx(file_path)
    for cell_text in cell_texts:
        print(cell_text)
        print("--------------")
8.29 2024-08-29 16:37:09 +08:00			`from docx import Document`

			`def read_docx(file_path):`
			`# 尝试打开文档`
			`try:`
			`doc = Document(file_path)`
			`except Exception as e:`
			`print(f"Error opening file: {e}")`
			`return`

			`# 读取文档中的所有段落并打印它们`
			`for para in doc.paragraphs:`
			`print(para.text)`
10.10 2024-10-10 21:03:02 +08:00			`def read_docx_tables(file_path):`
			`# 尝试打开文档`
			`try:`
			`doc = Document(file_path)`
			`except Exception as e:`
			`print(f"Error opening file: {e}")`
			`return`

			`# 读取文档中的所有表格`
			`if not doc.tables:`
			`print("No tables found in the document.")`
			`return`
8.29 2024-08-29 16:37:09 +08:00
10.10 2024-10-10 21:03:02 +08:00			`# 遍历文档中的每个表格`
			`for table_idx, table in enumerate(doc.tables):`
			`print(f"Table {table_idx + 1}:")`
			`# 遍历表格中的每一行`
			`for row_idx, row in enumerate(table.rows):`
			`row_data = []`
			`# 遍历每一行中的单元格`
			`for cell in row.cells:`
			`row_data.append(cell.text.strip()) # 去除单元格内容前后空白`
			`print(f"Row {row_idx + 1}: {row_data}")`
			`print("\n" + "-" * 40 + "\n") # 打印分隔线`
8.29 2024-08-29 16:37:09 +08:00
无效标废标提取优化版本 2024-10-11 11:08:38 +08:00			`def read_tables_from_docx(file_path):`
			`# 尝试打开文档`
			`try:`
			`doc = Document(file_path)`
			`except Exception as e:`
			`print(f"Error opening file: {e}")`
			`return []`

			`# 初始化列表来保存符合条件的单元格内容`
			`cell_contents = []`

			`# 读取文档中的所有表格`
			`if not doc.tables:`
			`print("No tables found in the document.")`
			`return []`

			`# 遍历文档中的每个表格`
			`for table_idx, table in enumerate(doc.tables):`
			`# 遍历表格中的每一行`
			`for row_idx, row in enumerate(table.rows):`
			`# 遍历每一行中的单元格`
			`for cell in row.cells:`
			`cell_text = cell.text.strip() # 去除单元格内容前后空白`
			`if len(cell_text) > 6: # 检查文字数量是否大于5`
			`cell_contents.append(cell_text)`

			`# 返回符合条件的单元格内容`
			`return cell_contents`

8.29 2024-08-29 16:37:09 +08:00			`if __name__ == "__main__":`
无效标废标提取优化版本 2024-10-11 11:08:38 +08:00			`file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\2-招标文件.docx'`
			`# output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\tmp" # 前附表json文件`
10.10 2024-10-10 21:03:02 +08:00			`# read_docx(file_path)`
无效标废标提取优化版本 2024-10-11 11:08:38 +08:00			`read_docx_tables(file_path)`
			`list=read_tables_from_docx(file_path)`
			`for i in list:`
			`print(i)`
			`print("--------------")`