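"""Scrape administrative penalty notices from the CBIRC website.

Walks the paginated listing with Selenium, keeps entries dated on or after a
cutoff whose link text marks a penalty decision ("处罚决定" or "监罚"), and
appends the results to an Excel file in batches.
"""
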
import os
import random
import time
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def create_empty_excel(filename):
    """Create an empty Excel file with the expected columns."""
    columns = ["标题", "网址", "日期"]  # Title, URL, Date
    df = pd.DataFrame(columns=columns)
    df.to_excel(filename, index=False)


def save_data_to_excel(data, filename):
    """Append the collected rows to the Excel file, creating it first if needed."""
    new_data_df = pd.DataFrame(data, columns=["标题", "网址", "日期"])
    if not os.path.exists(filename):
        create_empty_excel(filename)

    # Read the existing Excel file into a DataFrame.
    existing_df = pd.read_excel(filename)

    # Use concat rather than append (DataFrame.append was removed in pandas 2.0).
    updated_df = pd.concat([existing_df, new_data_df], ignore_index=True)

    # Write the updated DataFrame back, overwriting the original file.
    updated_df.to_excel(filename, index=False)


def random_wait(min_time=1, max_time=3):
    """Sleep for a random interval so request timing looks less robotic."""
    time.sleep(random.uniform(min_time, max_time))


def create_browser():
    options = webdriver.ChromeOptions()
    # Reduce the obvious automation fingerprints Chrome exposes by default.
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--headless")
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver = webdriver.Chrome(options=options)
    # Inject a stealth script before any page script runs: hide the
    # navigator.webdriver flag and fake a few properties that headless
    # Chrome normally leaves empty.
    driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': '''
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            });
            window.navigator.chrome = {
                runtime: {}
            };
            Object.defineProperty(navigator, 'languages', {
                get: () => ['en-US', 'en']
            });
            Object.defineProperty(navigator, 'plugins', {
                get: () => [1, 2, 3, 4, 5]
            });
        '''
    })
    return driver


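# Optional smoke test for the browser factory (assumes a local Chrome install):
# driver = create_browser()
# driver.get("https://www.cbirc.gov.cn")
# print(driver.title)
# driver.quit()
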
def fetch_and_save_data(base_page_url, base_url, date_limit, output_filename='extracted_data.xlsx', batch_size=100):
    driver = create_browser()
    collected_data = []
    driver.get(base_page_url)
    should_continue = True
    cur_page = 0
    batch_number = 1

    while should_continue:
        cur_page += 1
        print(f"Visiting new page: {cur_page}")
        random_wait()

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        div_elements = soup.find_all('div', class_="panel-row ng-scope")

        for div in div_elements:
            date_span = div.find('span', class_='date ng-binding')
            if date_span:
                date_text = date_span.text.strip()
                date = datetime.strptime(date_text, "%Y-%m-%d")
                if date >= date_limit:
                    link = div.find('a', href=True)
                    # Keep only penalty-decision entries ("处罚决定" / "监罚").
                    if link and ("处罚决定" in link.text or "监罚" in link.text):
                        title = link.get('title', '')
                        href = link['href']
                        full_url = base_url + href
                        collected_data.append([title, full_url, date_text])
                        if len(collected_data) >= batch_size:
                            save_data_to_excel(collected_data, output_filename)
                            collected_data = []  # Reset the buffer after saving.
                            print(f"Batch {batch_number} saved. Continuing to next batch.")
                            batch_number += 1
                else:
                    # The listing is ordered newest first, so the first entry older
                    # than the cutoff means every remaining entry is too old.
                    should_continue = False
                    break

        if should_continue:
            try:
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//a[@ng-click='pager.next()']"))
                )
                ActionChains(driver).click(next_button).perform()
            except Exception as e:
                print(f"Failed to navigate to next page: {e}")
                break

    if collected_data:  # Save any remaining rows after finishing all pages.
        save_data_to_excel(collected_data, output_filename)
        print("Final batch saved.")

    driver.quit()
    print(f"Data has been saved or appended to {output_filename}")


# Example usage
base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
date_limit = datetime(2024, 8, 20)
# Defaults write to extracted_data.xlsx in the current directory, batch_size=100.
fetch_and_save_data(base_page_url, base_url, date_limit)
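
# Quick sanity check (optional): read the output back and eyeball the first rows.
# df = pd.read_excel('extracted_data.xlsx')
# print(df.head())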