from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time

def clean_text(html_content):
    # Parse the cell HTML with BeautifulSoup and extract the text node by
    # node, so that no unwanted whitespace is inserted between elements
    soup = BeautifulSoup(html_content, 'html.parser')
    text = ""  # accumulator for the concatenated text
    for element in soup.stripped_strings:  # every text node, stripped of surrounding whitespace
        if element == "一、" or element == "二、":  # Chinese list markers start a new line
            text += "\n" + element
        else:
            text += element  # append directly, without adding extra spaces
    return text.strip()  # return the cleaned text
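
# Illustrative check for clean_text (the HTML snippet below is made up, not
# taken from the target site):
#   clean_text("<p>一、</p><p>第一条</p><p>二、</p><p>第二条</p>")
# would return "一、第一条\n二、第二条" -- each list marker starts on a new line.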


def fetch_data(urls):
    # Configure Chrome to run in the background (headless)
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
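    # Note: recent Chrome releases also support a newer headless mode via
    # options.add_argument('--headless=new'); if the page renders differently
    # in headless mode, that variant is worth trying.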

    # Initialise the WebDriver
    driver = webdriver.Chrome(options=options)
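    # (Selenium 4.6+ resolves a matching chromedriver automatically via
    # Selenium Manager; older Selenium versions need the chromedriver binary
    # on PATH.)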

    # Empty DataFrame that will collect the data from every page
    all_data = pd.DataFrame()

    for url in urls:
        # Load the page
        driver.get(url)
        time.sleep(3)  # crude wait for the JavaScript to finish rendering
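        # A fixed sleep is simple but fragile. An explicit wait is usually
        # more robust; a minimal sketch, assuming the table appears once
        # rendering has finished:
        #   from selenium.webdriver.common.by import By
        #   from selenium.webdriver.support.ui import WebDriverWait
        #   from selenium.webdriver.support import expected_conditions as EC
        #   WebDriverWait(driver, 10).until(
        #       EC.presence_of_element_located((By.CLASS_NAME, 'MsoNormalTable')))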

        # Grab the rendered page source
        html = driver.page_source

        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('table', class_='MsoNormalTable')

        # If the page contains the expected table
        if table:
            rows = table.find_all('tr')
            temp_dict = {}
            for row in rows:
                columns = row.find_all('td')
                if len(columns) >= 2:  # each row needs at least two cells (header and content)
                    header = columns[0].get_text(strip=True)
                    content_html = str(columns[1])  # raw HTML of the content cell
                    content = clean_text(content_html)  # clean it and drop unwanted spaces
                    temp_dict[header] = content

            # Turn the dict into a one-row DataFrame and append it to the total
            df = pd.DataFrame([temp_dict])
            all_data = pd.concat([all_data, df], ignore_index=True)

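    # Note: if driver.get() raises partway through, the browser is left
    # running; wrapping the loop in try/finally and calling driver.quit()
    # in the finally block would be a more defensive variant.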
    # Close the browser
    driver.quit()

    return all_data


# URLs to process
urls = [
    "https://www.cbirc.gov.cn/cn/view/pages/ItemDetail.html?docId=1171824&itemId=4115&generaltype=9",
    # more URLs can be added here
]

# Fetch the data
result_data = fetch_data(urls)

# Save the result to an Excel file
result_data.to_excel('output_data.xlsx', index=False)
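# Note: writing .xlsx files with DataFrame.to_excel requires the openpyxl
# package (pip install openpyxl).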