"""Scrape administrative-penalty disclosure links from the CBIRC website.

Walks the paginated item list with Selenium (basic anti-bot-detection
measures enabled), collects every penalty-disclosure-table link dated on
or after 2023-11-28, and writes the de-duplicated URLs to ``url2.txt``.
"""

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
from datetime import datetime
import random

# Listing page for administrative penalties ("行政处罚", 监管局本级 level).
# The itemId=4115 variant covers the branch-office (监管分局本级) listing:
# "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4114&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'


def random_wait(min_time=1, max_time=5):
    """Sleep for a random interval to mimic human browsing behavior.

    Args:
        min_time: Minimum wait time in seconds.
        max_time: Maximum wait time in seconds.
    """
    time.sleep(random.uniform(min_time, max_time))


def create_browser():
    """Create a Chrome WebDriver configured to evade basic bot detection.

    Disables the AutomationControlled blink feature, spoofs a regular
    desktop user agent, and patches ``navigator`` properties that
    automated Chrome normally exposes (``webdriver``, ``plugins``, ...).

    Returns:
        A configured ``selenium.webdriver.Chrome`` instance.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-blink-features=AutomationControlled")
    # options.add_argument("--headless")  # Uncomment this line to use headless mode
    options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_experimental_option('useAutomationExtension', False)
    driver = webdriver.Chrome(options=options)
    # Inject JS before every page load to hide the usual automation markers.
    driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': '''
            Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
            window.navigator.chrome = { runtime: {} };
            Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
            Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
        '''
    })
    return driver


def is_date_valid(date_text):
    """Return True if *date_text* (``YYYY-MM-DD``) is on/after 2023-11-28.

    Args:
        date_text: Date string as shown in the listing, e.g. ``"2023-12-01"``.
    """
    given_date = datetime.strptime(date_text, "%Y-%m-%d")
    return given_date >= datetime(2023, 11, 28)


def _collect_penalty_urls(driver):
    """Page through the listing and collect penalty-disclosure detail URLs.

    Stops at the first entry older than the cutoff (the listing is sorted
    newest-first) or when no clickable next-page button remains.

    Args:
        driver: An already-created Selenium WebDriver.

    Returns:
        A set of absolute detail-page URLs (de-duplicated).
    """
    unique_urls = set()
    driver.get(base_page_url)
    cur_page = 0
    while True:
        cur_page += 1
        print("Visiting new page:" + str(cur_page))
        # Wait for the Angular front end to render the list via JavaScript.
        random_wait()
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # Each result row is rendered as a "panel-row ng-scope" <div>.
        div_elements = soup.find_all('div', class_="panel-row ng-scope")
        # Set to True only while rows within the date cutoff keep appearing.
        should_continue = False
        for div in div_elements:
            date_span = div.find('span', class_='date ng-binding')
            if not date_span:
                continue
            date_text = date_span.text.strip()
            if not is_date_valid(date_text):
                # Dates are sorted descending: every later row is older, stop.
                should_continue = False
                break
            should_continue = True
            link = div.find('a', href=True,
                            attrs={'ng-bind-html': 'x.docSubtitle|trustHtml'})
            # Only keep penalty-disclosure-table documents ("处罚信息公开表").
            if link and "处罚信息公开表" in link.text:
                full_url = base_url + link['href']
                # Skip malformed hrefs that would duplicate the path prefix.
                if "//cn/view/pages/" not in full_url:
                    unique_urls.add(full_url)
        if not should_continue:
            break
        # Advance to the next page, or stop if the pager is exhausted.
        try:
            next_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//a[@ng-click='pager.next()']"))
            )
            ActionChains(driver).move_to_element(next_button).click().perform()
            random_wait()  # Wait for the next page to load
        except Exception as e:
            print("No more pages or error occurred:", e)
            break
    return unique_urls


def main():
    """Run the scrape and save the collected URLs to ``url2.txt``."""
    driver = create_browser()
    try:
        unique_urls = _collect_penalty_urls(driver)
    finally:
        # Always release the browser, even if the scrape raised.
        driver.quit()
    with open('url2.txt', 'w') as file:
        for url in unique_urls:
            file.write(url + '\n')  # One URL per line
    # NOTE: message previously said "urls1.txt" while writing to url2.txt.
    print("URLs have been saved to url2.txt")
    print("Total URLs found:", len(unique_urls))


if __name__ == "__main__":
    main()