reptile/extract_table.py

from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time


def clean_text(html_content):
    # Parse the content with BeautifulSoup and extract the text node by node,
    # so that no unwanted spaces are introduced between fragments.
    soup = BeautifulSoup(html_content, 'html.parser')
    text = ""  # Accumulator for the concatenated text
    for element in soup.stripped_strings:  # Walk every text node, stripped of leading/trailing whitespace
        if element == "一、" or element == "二、":  # Chinese enumeration markers: start a new line
            text += "\n" + element
        else:
            text += element  # Concatenate directly, without adding extra spaces
    return text.strip()  # Return the cleaned text
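
# Illustrative example (not part of the original script): for a table cell such as
#   "<td>概述<span>一、</span>资产情况<span>二、</span>负债情况</td>"
# clean_text returns "概述\n一、资产情况\n二、负债情况": the enumeration markers
# "一、" and "二、" start new lines, and all other fragments are joined without
# extra spaces.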

def fetch_data(urls):
    # Configure Chrome to run headless (no visible browser window)
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    # Initialise the WebDriver
    driver = webdriver.Chrome(options=options)
    # Empty DataFrame that will collect the data from all pages
    all_data = pd.DataFrame()
    for url in urls:
        # Load the page
        driver.get(url)
        time.sleep(3)  # Wait for the JavaScript on the page to finish rendering
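        # Possible refinement (a sketch, not in the original script): replace the fixed
        # sleep with an explicit wait that blocks until the target table is present:
        #
        #   from selenium.webdriver.common.by import By
        #   from selenium.webdriver.support.ui import WebDriverWait
        #   from selenium.webdriver.support import expected_conditions as EC
        #
        #   WebDriverWait(driver, 10).until(
        #       EC.presence_of_element_located((By.CLASS_NAME, "MsoNormalTable"))
        #   )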
        # Grab the rendered page source
        html = driver.page_source
        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('table', class_='MsoNormalTable')
        # Only process the page if it contains the expected table
        if table:
            rows = table.find_all('tr')
            temp_dict = {}
            for row in rows:
                columns = row.find_all('td')
                if len(columns) >= 2:  # Each row needs at least two cells: a header and a content cell
                    header = columns[0].get_text(strip=True)
                    content_html = str(columns[1])  # Raw HTML of the content cell
                    content = clean_text(content_html)  # Clean the text and drop unwanted spaces
                    temp_dict[header] = content
            # Convert the dict into a one-row DataFrame and append it to the overall DataFrame
            df = pd.DataFrame([temp_dict])
            all_data = pd.concat([all_data, df], ignore_index=True)
    # Close the browser
    driver.quit()
    return all_data
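
# Design note (suggestion, not in the original script): if driver.get() or anything else
# inside the loop raises, driver.quit() above is never reached and the headless Chrome
# process is left running. Wrapping the loop in try/finally would guarantee cleanup:
#
#   driver = webdriver.Chrome(options=options)
#   try:
#       for url in urls:
#           ...
#   finally:
#       driver.quit()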

# URLs to process
urls = [
    "https://www.cbirc.gov.cn/cn/view/pages/ItemDetail.html?docId=1171824&itemId=4115&generaltype=9",
    # More URLs can be added here
]
# Fetch the data
result_data = fetch_data(urls)
# Save the result to an Excel file
result_data.to_excel('output_data.xlsx', index=False)
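
# Note: DataFrame.to_excel with an .xlsx target uses the openpyxl engine, so openpyxl
# must be installed; pandas raises ImportError otherwise. A CSV fallback that needs no
# extra dependency (utf-8-sig keeps the Chinese text readable in Excel):
#
#   result_data.to_csv('output_data.csv', index=False, encoding='utf-8-sig')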