import random
import time
from datetime import datetime

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def random_wait(min_time=1, max_time=5):
    """Sleep for a random duration between *min_time* and *max_time* seconds.

    Args:
        min_time: Lower bound of the wait, in seconds.
        max_time: Upper bound of the wait, in seconds.
    """
    time.sleep(random.uniform(min_time, max_time))


def create_browser(*, headless=False):
    """Create a Chrome WebDriver session configured to avoid detection.

    Args:
        headless: When true, pass ``--headless`` to Chrome. Defaults to
            ``False``, which matches the previous behaviour (the headless
            flag used to be a commented-out line).

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    if headless:
        options.add_argument("--headless")
    options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(options=options)

    # Register a script that runs on every new document and overrides the
    # navigator properties listed below before the page's own code executes.
    driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': '''
            Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
            window.navigator.chrome = { runtime: {} };
            Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
            Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
        '''
    })
    return driver


# Initialize WebDriver
driver = create_browser()

#"https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4114&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
# Base URL information
# Alternative listing URL (itemId=4115), kept for reference:
#"https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4114&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'

# Set to store unique URLs
unique_urls = set()

# Earliest date (inclusive) that is_date_valid() accepts by default.
CUTOFF_DATE = datetime(2023, 11, 28)


def is_date_valid(date_text, cutoff=CUTOFF_DATE):
    """Return True when *date_text* is on or after *cutoff*.

    Args:
        date_text: Date string in ``YYYY-MM-DD`` form. Surrounding
            whitespace is stripped before parsing.
        cutoff: Earliest accepted ``datetime`` (inclusive). Defaults to
            ``CUTOFF_DATE`` (2023-11-28).

    Returns:
        ``True`` if the parsed date is greater than or equal to *cutoff*.

    Raises:
        ValueError: If *date_text* does not match ``%Y-%m-%d``.
    """
    given_date = datetime.strptime(date_text.strip(), "%Y-%m-%d")
    return given_date >= cutoff


# Visit the initial page
driver.get(base_page_url)
cur_page = 0

# Keep processing pages until the stop condition is met.
# NOTE(review): the original comment here said "until a date before
# June 1, 2023, is found", but is_date_valid() uses 2023-11-28 as its
# cutoff -- confirm which date is intended.
while True:
    cur_page += 1
    print("Visiting new page:" + str(cur_page))

    # Wait for JavaScript to load
    random_wait()

    # Get the page source after JS execution
    html = driver.page_source

    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Find all