2024-08-29 16:37:09 +08:00
|
|
|
from docx import Document
|
|
|
|
|
|
|
|
def read_docx(file_path):
    """Print the text of every paragraph in a .docx document.

    Args:
        file_path: Path of the document, passed straight to ``Document``.

    Returns:
        None. When the document cannot be opened, an error message is
        printed and the function returns without printing any paragraph.
    """
    # Try to open the document; a failure is reported instead of raised so
    # that a caller running this as a script simply sees the message.
    try:
        doc = Document(file_path)
    except Exception as e:
        print(f"Error opening file: {e}")
        return

    # Read every paragraph in the document and print its text.
    for para in doc.paragraphs:
        print(para.text)
|
2024-10-10 21:03:02 +08:00
|
|
|
def read_docx_tables(file_path):
    """Print the contents of every table in a .docx document, row by row.

    For each table a ``Table N:`` header is printed, followed by one
    ``Row M: [...]`` line per row containing the stripped text of each cell,
    and finally a dashed separator line.

    Args:
        file_path: Path of the document, passed straight to ``Document``.

    Returns:
        None. When the document cannot be opened, or contains no tables, a
        message is printed and the function returns early.
    """
    # Try to open the document; a failure is reported instead of raised.
    try:
        doc = Document(file_path)
    except Exception as e:
        print(f"Error opening file: {e}")
        return

    # Nothing to show when the document has no tables at all.
    tables = doc.tables
    if not tables:
        print("No tables found in the document.")
        return

    # Walk every table; numbering shown to the user is 1-based.
    for table_idx, table in enumerate(tables, start=1):
        print(f"Table {table_idx}:")
        # Walk every row of the current table.
        for row_idx, row in enumerate(table.rows, start=1):
            # Collect each cell's text with surrounding whitespace removed.
            row_data = [cell.text.strip() for cell in row.cells]
            print(f"Row {row_idx}: {row_data}")
        # Separator line between tables.
        print("\n" + "-" * 40 + "\n")
|
2024-08-29 16:37:09 +08:00
|
|
|
|
2024-10-11 11:08:38 +08:00
|
|
|
def read_tables_from_docx(file_path, *, length_threshold=6):
    """Collect the text of table cells that are longer than a threshold.

    Every cell of every table in the document is visited in document order
    (table by table, row by row, cell by cell). A cell's text is stripped of
    leading/trailing whitespace and kept only when its length is strictly
    greater than ``length_threshold``.

    Args:
        file_path: Path of the document, passed straight to ``Document``.
        length_threshold: Cells whose stripped text has more than this many
            characters are kept. Defaults to 6, the value that was
            previously hard-coded.

    Returns:
        A list of the qualifying cell texts. An empty list is returned when
        the document cannot be opened (an error message is printed) or when
        it contains no tables (a notice is printed).
    """
    # Try to open the document; a failure is reported and yields no results.
    try:
        doc = Document(file_path)
    except Exception as e:
        print(f"Error opening file: {e}")
        return []

    # Nothing to collect when the document has no tables at all.
    tables = doc.tables
    if not tables:
        print("No tables found in the document.")
        return []

    # Accumulates the cell texts that pass the length filter.
    cell_contents = []
    for table in tables:
        for row in table.rows:
            for cell in row.cells:
                # Strip surrounding whitespace before measuring the length.
                cell_text = cell.text.strip()
                # Strict comparison: with the default threshold a cell needs
                # at least 7 characters to be kept.
                if len(cell_text) > length_threshold:
                    cell_contents.append(cell_text)

    return cell_contents
|
|
|
|
|
2024-08-29 16:37:09 +08:00
|
|
|
if __name__ == "__main__":
    # Manual smoke run against a local tender document.
    file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\2-招标文件.docx'
    # output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfilesdocx\\tmp"  # JSON file for the front schedule table
    # read_docx(file_path)

    # Dump every table, then list the cells that pass the length filter.
    read_docx_tables(file_path)
    # Named to avoid shadowing the builtin ``list``.
    long_cells = read_tables_from_docx(file_path)
    for cell_text in long_cells:
        print(cell_text)
        print("--------------")
|