from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time


def clean_text(html_content):
    # Parse the content with BeautifulSoup and extract the text element by
    # element, so that no unnecessary spaces are introduced between fragments.
    soup = BeautifulSoup(html_content, 'html.parser')
    text = ""  # empty string used to accumulate the text
    for element in soup.stripped_strings:  # all text nodes, stripped of leading/trailing whitespace
        if element == "一、" or element == "二、":  # section markers start a new line
            text += "\n" + element
        else:
            text += element  # append the text directly, without extra spaces
    return text.strip()  # return the processed text
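
# A quick illustration of what clean_text returns (the <td> fragment below is a
# made-up example, not taken from the target pages):
#   clean_text('<td><p>一、</p><p>条款A</p><p>二、</p><p>条款B</p></td>')
#   -> '一、条款A\n二、条款B'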


def fetch_data(urls):
    # Configure Chrome options so the browser runs in the background
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
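    # Depending on the local Chrome/Selenium versions (an assumption about your
    # environment), the new headless mode ('--headless=new') may work better.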

    # Initialize the WebDriver
    driver = webdriver.Chrome(options=options)

    # Initialize an empty DataFrame to hold the final data
    all_data = pd.DataFrame()

    for url in urls:
        # Visit the page
        driver.get(url)
        time.sleep(3)  # wait for the JavaScript to execute
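
        # Sketch of an alternative to the fixed sleep, assuming the target table
        # keeps the 'MsoNormalTable' class: wait explicitly until it appears in
        # the DOM (uncomment and move the imports to the top of the file to use).
        # from selenium.webdriver.common.by import By
        # from selenium.webdriver.support.ui import WebDriverWait
        # from selenium.webdriver.support import expected_conditions as EC
        # WebDriverWait(driver, 10).until(
        #     EC.presence_of_element_located((By.CSS_SELECTOR, 'table.MsoNormalTable')))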

        # Get the page source
        html = driver.page_source

        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('table', class_='MsoNormalTable')

        # If the page contains the table
        if table:
            rows = table.find_all('tr')
            temp_dict = {}
            for row in rows:
                columns = row.find_all('td')
                if len(columns) >= 2:  # make sure the row has at least two cells (header and content)
                    header = columns[0].get_text(strip=True)
                    content_html = str(columns[1])  # raw HTML of the content cell
                    content = clean_text(content_html)  # clean it and remove unnecessary spaces
                    temp_dict[header] = content

            # Convert the dict to a one-row DataFrame and append it to the overall DataFrame
            df = pd.DataFrame([temp_dict])
            all_data = pd.concat([all_data, df], ignore_index=True)
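            # Note: pd.concat aligns rows by column name, so pages whose tables
            # use different header labels end up with NaN in the missing columns.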

    # Close the browser
    driver.quit()

    return all_data


# Define the list of URLs to process
urls = [
    "https://www.cbirc.gov.cn/cn/view/pages/ItemDetail.html?docId=1171824&itemId=4115&generaltype=9",
    # more URLs can be added here
]

# Call the function and collect the data
result_data = fetch_data(urls)

# Save to an Excel file
result_data.to_excel('output_data.xlsx', index=False)
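# Note: writing .xlsx via to_excel needs an Excel engine such as openpyxl
# installed (e.g. pip install openpyxl); this assumes one is available.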