from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
from datetime import datetime
import random


# Function to generate a random waiting time
def random_wait(min_time=1, max_time=5):
    time.sleep(random.uniform(min_time, max_time))


# Function to create a new browser session with options to avoid detection
def create_browser():  # suited to a local environment
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--headless")  # Enable headless mode
    options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_experimental_option('useAutomationExtension', False)
    driver = webdriver.Chrome(options=options)
    driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': '''
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            });
            window.navigator.chrome = {
                runtime: {}
            };
            Object.defineProperty(navigator, 'languages', {
                get: () => ['en-US', 'en']
            });
            Object.defineProperty(navigator, 'plugins', {
                get: () => [1, 2, 3, 4, 5]
            });
        '''
    })
    return driver


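# Optional sanity check (an addition, not part of the original script): confirm that the
# CDP patch in create_browser() actually hides navigator.webdriver. It assumes a local
# ChromeDriver is available, just like create_browser() itself does.
def check_stealth():
    driver = create_browser()
    try:
        driver.get("about:blank")
        # Expect None here once the Page.addScriptToEvaluateOnNewDocument script has run.
        print("navigator.webdriver =", driver.execute_script("return navigator.webdriver"))
    finally:
        driver.quit()

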
# def create_browser():  # suited to running inside a Docker environment
#     options = webdriver.ChromeOptions()
#     options.add_argument("--disable-blink-features=AutomationControlled")
#     options.add_argument("--headless")  # Enable headless mode
#     options.add_argument(
#         "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
#     options.add_experimental_option('excludeSwitches', ['enable-automation'])
#     options.add_experimental_option('useAutomationExtension', False)
#     driver = webdriver.Remote(
#         command_executor='http://chrome:4444/wd/hub',
#         options=options
#     )
#     return driver

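# The Remote variant above points at http://chrome:4444/wd/hub, i.e. it expects a Selenium
# server reachable under the hostname "chrome". A minimal docker-compose sketch that could
# provide such a service (an assumption, not part of the original project; adjust the image
# tag and shared memory size as needed):
#
#   services:
#     chrome:
#       image: selenium/standalone-chrome
#       shm_size: 2gb
#       ports:
#         - "4444:4444"

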
def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix):
    # Initialize WebDriver
    driver = create_browser()

    # Set to store unique URLs
    unique_urls = set()
    total_urls_saved = 0  # Total count of URLs saved
    urls_in_current_file = 0  # Count of URLs in the current file

    # Function to check date
    def is_date_valid(date_text):
        given_date = datetime.strptime(date_text, "%Y-%m-%d")
        return given_date >= date_limit
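    # e.g. with date_limit = datetime(2024, 8, 20): is_date_valid("2024-08-21") is True
    # and is_date_valid("2024-08-19") is False (listing dates use the YYYY-MM-DD format)
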
    # Function to save URLs to file
    def save_urls_to_file(urls, file_index):
        nonlocal total_urls_saved
        nonlocal urls_in_current_file

        with open(f"{output_path_prefix}{file_index}.txt", 'a') as file:
            for url in urls:
                file.write(url + '\n')
                total_urls_saved += 1
                urls_in_current_file += 1
        print(f"URLs have been saved to {output_path_prefix}{file_index}.txt")
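        # e.g. with output_path_prefix = 'url' and file_index = 1, the URLs are appended to url1.txt
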
    # Visit the initial page
    driver.get(base_page_url)
    cur_page = 0
    file_index = 1

    # Keep processing until a date before the given date_limit is found
    while True:
        cur_page += 1
        print("Visiting new page:" + str(cur_page))

        # Wait for JavaScript to load
        random_wait()

        # Get the page source after JS execution
        html = driver.page_source

        # Parse the HTML using BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')

        # Find all <div> elements that match class conditions
        div_elements = soup.find_all('div', class_="panel-row ng-scope")

        # Variable to determine if loop should continue
        should_continue = False

        # Iterate through the div elements to find links and dates
        for div in div_elements:
            date_span = div.find('span', class_='date ng-binding')
            if date_span:
                date_text = date_span.text.strip()
                if is_date_valid(date_text):
                    should_continue = True
                    link = div.find('a', href=True, attrs={'ng-bind-html': 'x.docSubtitle|trustHtml'})
                    # "处罚信息公开表" means "penalty information disclosure form"
                    if link and "处罚信息公开表" in link.text:
                        href = link['href']
                        full_url = base_url + href
                        if "//cn/view/pages/" not in full_url:
                            unique_urls.add(full_url)
                else:
                    # Since this date is invalid and dates are sorted in descending order, no need to continue
                    should_continue = False
                    break

        # Save URLs if they exceed 2000 and reset unique_urls
        if len(unique_urls) >= 2000:
            save_urls_to_file(unique_urls, file_index)
            unique_urls.clear()

            # If the current file exceeds 20000 URLs, start a new file
            if urls_in_current_file >= 20000:
                file_index += 1
                urls_in_current_file = 0

        # Check if loop should continue
        if not should_continue:
            break

        # Try to find and click the next page button
        try:
            next_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//a[@ng-click='pager.next()']"))
            )
            ActionChains(driver).move_to_element(next_button).click().perform()
            random_wait()  # Wait for the next page to load
        except Exception as e:
            print("No more pages or error occurred:", e)
            break

    # Save remaining URLs if any
    if unique_urls:
        save_urls_to_file(unique_urls, file_index)

    # Close the browser
    driver.quit()

    print("Total URLs saved:", total_urls_saved)


# Example usage
# base_page_url is the first page of the listing you want to scrape; the script turns the pages automatically.
# Each txt file stores 20000 URLs; the files are named url1.txt, url2.txt, url3.txt, ...
base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
date_limit = datetime(2024, 8, 20)
output_path_prefix = 'url'
fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix)
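
# Optional follow-up (a sketch, not part of the original workflow): merge the saved
# url*.txt files and drop duplicates across files before any downstream crawling.
# Uncomment to use; the glob pattern assumes output_path_prefix = 'url' as set above.
# import glob
#
# merged = set()
# for path in sorted(glob.glob(f"{output_path_prefix}*.txt")):
#     with open(path) as f:
#         merged.update(line.strip() for line in f if line.strip())
# print("Unique URLs across all files:", len(merged))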