# reptile/extract_table2.py

import pandas as pd
from bs4 import BeautifulSoup

# Load the saved HTML page from disk, decoding it as UTF-8 text.
with open(
    'D:/folder/study/reptile-project/data.html',
    mode='r',
    encoding='utf-8',
) as html_file:
    html_content = html_file.read()

# Parse the markup with BeautifulSoup's "html.parser" backend.
soup = BeautifulSoup(html_content, features='html.parser')

# Result record: one entry per output field, in output order.
# Every field starts as an empty string and is overwritten once a value
# has been extracted from the page.
results = dict.fromkeys(
    (
        "行政处罚决定书文号",
        "被处罚当事人",
        "主要违法违规事实",
        "行政处罚依据",
        "行政处罚决定",
        "作出处罚决定的机关名称",
        "作出处罚决定的日期",
    ),
    "",
)

def _cell_text(row, cell_index):
    """Return the stripped text of one ``<td>`` cell of a table row.

    The text of the first ``<p>`` inside the cell is preferred. When the
    cell has no ``<p>`` the cell's own text is used instead, and when the
    row has no cell at ``cell_index`` an empty string is returned, so a
    slightly irregular row no longer aborts the whole extraction.

    Args:
        row: A BeautifulSoup ``<tr>`` tag.
        cell_index: Zero-based position of the wanted ``<td>`` in the row.

    Returns:
        The whitespace-stripped cell text, or ``""`` if the cell is absent.
    """
    cells = row.find_all('td')
    if cell_index >= len(cells):
        return ""
    cell = cells[cell_index]
    paragraph = cell.find('p')
    if paragraph is None:
        return cell.get_text(strip=True)
    return paragraph.get_text(strip=True)


# Collect every <tr> element found anywhere in the document.
table_rows = soup.find_all('tr')

# The row/cell positions below are tied to a layout of at least nine rows;
# with fewer rows every field keeps its empty-string default.
if len(table_rows) >= 9:
    results["行政处罚决定书文号"] = _cell_text(table_rows[0], 1)

    # Individual's name, organisation name, and the organisation's legal
    # representative. Rows 1 and 2 read the third cell while row 3 reads the
    # second -- presumably rows 1-2 carry an extra leading label cell;
    # TODO confirm against the page markup.
    person_name = _cell_text(table_rows[1], 2)
    org_name = _cell_text(table_rows[2], 2)
    legal_rep_name = _cell_text(table_rows[3], 1)
    # Pack the three party fields into one multi-line string.
    results["被处罚当事人"] = f'"个人姓名": "{person_name}"\n"单位名称": "{org_name}"\n"单位法定代表人（主要负责人）姓名": "{legal_rep_name}"'

    # Rows 4-8 each keep their value in the second cell.
    results["主要违法违规事实"] = _cell_text(table_rows[4], 1)
    results["行政处罚依据"] = _cell_text(table_rows[5], 1)
    results["行政处罚决定"] = _cell_text(table_rows[6], 1)
    results["作出处罚决定的机关名称"] = _cell_text(table_rows[7], 1)
    results["作出处罚决定的日期"] = _cell_text(table_rows[8], 1)

# Wrap the single result record in a one-row DataFrame; the dict keys
# become the column names.
df = pd.DataFrame(data=[results])

# Write the row to an Excel workbook in the current working directory,
# without the index column, using the openpyxl engine.
df.to_excel(
    excel_writer='output_data.xlsx',
    index=False,
    engine='openpyxl',
)