import os
import random
import time
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def create_empty_excel(filename):
    """Create an empty Excel file with the expected columns."""
    columns = ["标题", "网址", "日期"]  # title, URL, date
    df = pd.DataFrame(columns=columns)
    df.to_excel(filename, index=False)


def save_data_to_excel(data, filename):
    """Append the collected rows to the Excel file, creating it first if it does not exist."""
    new_data_df = pd.DataFrame(data, columns=["标题", "网址", "日期"])
    if not os.path.exists(filename):
        create_empty_excel(filename)
    # Read the existing Excel file into a DataFrame.
    existing_df = pd.read_excel(filename)
    # Use concat rather than append to merge the DataFrames
    # (DataFrame.append was removed in pandas 2.0).
    updated_df = pd.concat([existing_df, new_data_df], ignore_index=True)
    # Write the updated DataFrame back, overwriting the original file.
    updated_df.to_excel(filename, index=False)


def random_wait(min_time=1, max_time=3):
    """Sleep for a random interval to mimic human browsing."""
    time.sleep(random.uniform(min_time, max_time))


def create_browser():
    """Start a headless Chrome instance with basic anti-automation-detection tweaks."""
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--headless")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    driver = webdriver.Chrome(options=options)
    # Mask common webdriver fingerprints before any page script runs.
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
            Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
            window.navigator.chrome = { runtime: {} };
            Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
            Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
        """
    })
    return driver


def fetch_and_save_data(base_page_url, base_url, date_limit,
                        output_filename="extracted_data.xlsx", batch_size=100):
    """Walk the paginated listing and collect penalty notices dated on or after date_limit."""
    driver = create_browser()
    collected_data = []
    driver.get(base_page_url)
    should_continue = True
    cur_page = 0
    batch_number = 1
    while should_continue:
        cur_page += 1
        print(f"Visiting new page: {cur_page}")
        random_wait()
        soup = BeautifulSoup(driver.page_source, "html.parser")
        div_elements = soup.find_all("div", class_="panel-row ng-scope")
        for div in div_elements:
            date_span = div.find("span", class_="date ng-binding")
            if date_span:
                date_text = date_span.text.strip()
                date = datetime.strptime(date_text, "%Y-%m-%d")
                if date >= date_limit:
                    link = div.find("a", href=True)
                    # Keep only links whose text marks a penalty decision.
                    if link and ("处罚决定" in link.text or "监罚" in link.text):
                        title = link.get("title", "")
                        full_url = base_url + link["href"]
                        collected_data.append([title, full_url, date_text])
                        if len(collected_data) >= batch_size:
                            save_data_to_excel(collected_data, output_filename)
                            collected_data = []  # Reset the list after saving the batch.
                            print(f"Batch {batch_number} saved. Continuing to next batch.")
                            batch_number += 1
                else:
                    # The listing is newest-first, so an older entry means we are done.
                    should_continue = False
                    break
        if should_continue:
            try:
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//a[@ng-click='pager.next()']"))
                )
                ActionChains(driver).click(next_button).perform()
            except Exception as e:
                print(f"Failed to navigate to next page: {e}")
                break
    if collected_data:
        # Save any remaining rows after finishing all pages.
        save_data_to_excel(collected_data, output_filename)
        print("Final batch saved.")
    driver.quit()
    print(f"Data has been saved or appended to {output_filename}")


# Example usage
base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
base_url = "https://www.cbirc.gov.cn/cn/view/pages/"
date_limit = datetime(2024, 8, 20)

# By default this writes to extracted_data.xlsx in the current directory, with batch_size=100.
fetch_and_save_data(base_page_url, base_url, date_limit)
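
# A minimal sketch of an alternative invocation, using the parameters the
# function already exposes. The filename and batch size below are hypothetical,
# not values from the original run: a smaller batch_size flushes partial
# progress to disk more often, at the cost of more frequent Excel rewrites.
# fetch_and_save_data(
#     base_page_url,
#     base_url,
#     datetime(2024, 1, 1),                    # scrape back to an earlier cutoff
#     output_filename="penalties_2024.xlsx",   # hypothetical output path
#     batch_size=50,
# )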