reptile/标题网址提取.py
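# Scrapes the CBIRC penalty-announcement listing with Selenium + BeautifulSoup and
# appends each notice's title (标题), URL (网址), and date (日期) to an Excel file
# in batches, stopping once entries fall before a given cutoff date.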
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import time
from datetime import datetime
import random


def create_empty_excel(filename):
    """Create an empty Excel file with the expected columns."""
    columns = ["标题", "网址", "日期"]
    df = pd.DataFrame(columns=columns)
    df.to_excel(filename, index=False)


def save_data_to_excel(data, filename):
    """Append the collected rows to the Excel file, creating it first if needed."""
    new_data_df = pd.DataFrame(data, columns=['标题', '网址', '日期'])
    if not os.path.exists(filename):
        create_empty_excel(filename)
    # Read the existing Excel file into a DataFrame.
    existing_df = pd.read_excel(filename)
    # Use concat rather than the removed DataFrame.append to merge the frames.
    updated_df = pd.concat([existing_df, new_data_df], ignore_index=True)
    # Write the updated DataFrame back, overwriting the original file.
    updated_df.to_excel(filename, index=False)
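

# Optional sketch, not part of the original flow: because save_data_to_excel appends,
# re-running the scraper can leave duplicate rows. One way to collapse them afterwards,
# keyed on the URL column (the function name here is illustrative):
def deduplicate_excel(filename):
    """Drop rows sharing the same 网址 value, keeping the first occurrence."""
    df = pd.read_excel(filename)
    df.drop_duplicates(subset=['网址'], keep='first', inplace=True)
    df.to_excel(filename, index=False)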


def random_wait(min_time=1, max_time=3):
    """Sleep for a random interval to make request pacing less bot-like."""
    time.sleep(random.uniform(min_time, max_time))


def create_browser():
    """Start a headless Chrome instance with basic automation-detection countermeasures."""
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--headless")
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    driver = webdriver.Chrome(options=options)
    # Patch common fingerprinting hooks before any page script runs.
    driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': '''
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            });
            window.navigator.chrome = {
                runtime: {}
            };
            Object.defineProperty(navigator, 'languages', {
                get: () => ['en-US', 'en']
            });
            Object.defineProperty(navigator, 'plugins', {
                get: () => [1, 2, 3, 4, 5]
            });
        '''
    })
    return driver
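

# A quick sanity check (an assumption, not in the original script): after the CDP patch
# above, the page-visible navigator.webdriver flag should read as undefined.
def check_stealth(driver):
    """Return navigator.webdriver as seen by page scripts (expected None after patching)."""
    return driver.execute_script("return navigator.webdriver")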


def fetch_and_save_data(base_page_url, base_url, date_limit, output_filename='extracted_data.xlsx', batch_size=100):
    """Walk the paginated listing, collect matching penalty notices dated on or
    after date_limit, and flush them to Excel in batches of batch_size rows."""
    driver = create_browser()
    collected_data = []
    driver.get(base_page_url)
    should_continue = True
    cur_page = 0
    batch_number = 1
    while should_continue:
        cur_page += 1
        print(f"Visiting new page: {cur_page}")
        random_wait()
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        div_elements = soup.find_all('div', class_="panel-row ng-scope")
        for div in div_elements:
            date_span = div.find('span', class_='date ng-binding')
            if date_span:
                date_text = date_span.text.strip()
                date = datetime.strptime(date_text, "%Y-%m-%d")
                if date >= date_limit:
                    link = div.find('a', href=True)
                    if link and ("处罚决定" in link.text or "监罚" in link.text):
                        title = link.get('title', '')
                        href = link['href']
                        full_url = base_url + href
                        collected_data.append([title, full_url, date_text])
                        if len(collected_data) >= batch_size:
                            save_data_to_excel(collected_data, output_filename)
                            collected_data = []  # Reset the buffer after saving
                            print(f"Batch {batch_number} saved. Continuing to next batch.")
                            batch_number += 1
                else:
                    # Stop at the first entry older than date_limit (assumes newest-first ordering).
                    should_continue = False
                    break
        if should_continue:
            try:
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//a[@ng-click='pager.next()']"))
                )
                ActionChains(driver).click(next_button).perform()
            except Exception as e:
                print(f"Failed to navigate to next page: {e}")
                break
    if collected_data:  # Save any remaining rows after finishing all pages
        save_data_to_excel(collected_data, output_filename)
        print("Final batch saved.")
    driver.quit()
    print(f"Data has been saved or appended to {output_filename}")


# Example usage
base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
date_limit = datetime(2024, 8, 20)
# Defaults write to extracted_data.xlsx in the current directory with batch_size=100.
fetch_and_save_data(base_page_url, base_url, date_limit)
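
# A different window or destination could be requested like this (values illustrative):
# fetch_and_save_data(base_page_url, base_url, datetime(2024, 1, 1),
#                     output_filename='penalties_2024.xlsx', batch_size=50)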