# reptile/scrape.py
# Snapshot metadata: 2024-07-26 23:44:36 +08:00, 134 lines, 4.5 KiB, Python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
from datetime import datetime
import random
def random_wait(min_time=1, max_time=5):
    """Sleep for a random duration between *min_time* and *max_time* seconds.

    Used between page interactions to mimic human browsing pace and
    reduce the chance of bot detection.
    """
    delay = random.uniform(min_time, max_time)
    time.sleep(delay)
def create_browser():
    """Launch a Chrome WebDriver session configured to avoid automation detection.

    Sets a desktop user-agent, disables the automation Blink feature and
    extension, and injects a CDP script on every new document that masks
    common headless/automation fingerprints (navigator.webdriver, chrome
    runtime, languages, plugins).

    Returns:
        A configured ``webdriver.Chrome`` instance.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    # chrome_options.add_argument("--headless")  # Uncomment this line to use headless mode
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    browser = webdriver.Chrome(options=chrome_options)
    # Run before any page script so the spoofed properties are in place
    # by the time the site's detection code executes.
    stealth_script = '''
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
});
window.navigator.chrome = {
runtime: {}
};
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US', 'en']
});
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5]
});
'''
    browser.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument',
                            {'source': stealth_script})
    return browser
# Initialize WebDriver — one shared browser session for the whole crawl
driver = create_browser()
# Base URL information: CBIRC listing page for administrative penalties
# (query-string item names are URL-encoded Chinese labels)
base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
# Prefix joined with each relative detail-page href found in the listing
base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
# Set to store unique URLs — deduplicates detail links across listing pages
unique_urls = set()
# Function to check date
def is_date_valid(date_text, cutoff=datetime(2023, 6, 1)):
    """Return True if *date_text* falls on or after *cutoff*.

    Args:
        date_text: Date string in ``YYYY-MM-DD`` format.
        cutoff: Earliest datetime considered valid. Defaults to
            2023-06-01, the crawl's original hard-coded boundary
            (now a parameter so other cutoffs can reuse this check).

    Returns:
        bool: True when the parsed date is >= *cutoff*.

    Raises:
        ValueError: If *date_text* does not match ``%Y-%m-%d``.
    """
    given_date = datetime.strptime(date_text, "%Y-%m-%d")
    return given_date >= cutoff
# Visit the initial page
driver.get(base_page_url)
cur_page = 0
# Keep processing until a date before June 1, 2023, is found.
# Relies on the listing being sorted by date in descending order,
# so the first stale date means everything after it is stale too.
while True:
    cur_page += 1
    print("Visiting new page:" + str(cur_page))
    # Wait for JavaScript to load (randomized to look less bot-like)
    random_wait()
    # Get the page source after JS execution
    html = driver.page_source
    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    # Find all <div> elements that match class conditions (one per listing row)
    div_elements = soup.find_all('div', class_="panel-row ng-scope")
    # Variable to determine if loop should continue; stays False when a page
    # yields no in-range dates, which ends the crawl below.
    should_continue = False
    # Iterate through the div elements to find links and dates
    for div in div_elements:
        date_span = div.find('span', class_='date ng-binding')
        if date_span:
            date_text = date_span.text.strip()
            if is_date_valid(date_text):
                should_continue = True
                # Anchor carrying the document subtitle (Angular binding attr)
                link = div.find('a', href=True, attrs={'ng-bind-html': 'x.docSubtitle|trustHtml'})
                # Keep only penalty-disclosure entries ("处罚信息公开表")
                if link and "处罚信息公开表" in link.text:
                    href = link['href']
                    full_url = base_url + href
                    # NOTE(review): presumably filters hrefs that are already
                    # absolute paths, which would duplicate "/cn/view/pages/"
                    # when prefixed — confirm against the site's markup.
                    if "//cn/view/pages/" not in full_url:
                        unique_urls.add(full_url)
            else:
                # Since this date is invalid and dates are sorted in descending order, no need to continue
                should_continue = False
                break
    # Check if loop should continue
    if not should_continue:
        break
    # Try to find and click the next page button
    try:
        next_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//a[@ng-click='pager.next()']"))
        )
        # move_to_element before click mimics a human cursor path
        ActionChains(driver).move_to_element(next_button).click().perform()
        random_wait()  # Wait for the next page to load
    except Exception as e:
        # Pager button missing/unclickable: treat as end of listing
        print("No more pages or error occurred:", e)
        break
# Close the browser
driver.quit()
# Count of unique URLs collected during the crawl
cnt = len(unique_urls)
# Write all unique URLs, one per line; explicit utf-8 avoids
# platform-dependent default encodings on Windows.
with open('urls.txt', 'w', encoding='utf-8') as file:
    for url in unique_urls:
        file.write(url + '\n')  # Write each URL followed by a newline
# BUG FIX: message previously said "urls1.txt" while the file actually
# written is urls.txt — report the real filename.
print("URLs have been saved to urls.txt")
print("Total URLs found:", cnt)