From 18fc3c502803ae7076a87ba636730ec2207acde9 Mon Sep 17 00:00:00 2001
From: zhangsan <646228430@qq.com>
Date: Fri, 2 Aug 2024 19:44:26 +0800
Subject: [PATCH] 2024.8.2

---
 Dockerfile          | 11 +++++-----
 docker-compose.yaml | 20 ++++++++++++++++++
 scrape.py           | 50 +++++++++++++++++++++++++++++----------------
 3 files changed, 57 insertions(+), 24 deletions(-)
 create mode 100644 docker-compose.yaml

diff --git a/Dockerfile b/Dockerfile
index 5e48a07..f78330d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,23 +10,22 @@ RUN sed -i 's|http://deb.debian.org|http://mirrors.aliyun.com|g' /etc/apt/source
 
 # Install dependencies
 RUN apt-get update && apt-get install -y --fix-missing \
-    chromium-driver \
     && apt-get clean \
     && rm -rf /var/lib/apt/lists/*
 
 # Install Python dependencies
 COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ -r requirements.txt
 
 # Copy the rest of the application code
 COPY . /reptile
 WORKDIR /reptile
 
 # Set environment variables
-ENV BASE_PAGE_URL="https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
-ENV BASE_URL="https://www.cbirc.gov.cn/cn/view/pages/"
-ENV DATE_LIMIT="2024-07-24"
-ENV OUTPUT_PATH_PREFIX="url"
+ENV BASE_PAGE_URL=${BASE_PAGE_URL}
+ENV BASE_URL=${BASE_URL}
+ENV DATE_LIMIT=${DATE_LIMIT}
+ENV OUTPUT_PATH_PREFIX=${OUTPUT_PATH_PREFIX}
 
 # Run the Python script
 CMD ["python", "scrape.py"]
diff --git a/docker-compose.yaml b/docker-compose.yaml
new file mode 100644
index 0000000..057c97b
--- /dev/null
+++ b/docker-compose.yaml
@@ -0,0 +1,20 @@
+version: '3'
+services:
+  chrome:
+    image: selenium/standalone-chrome
+    ports:
+      - "4444:4444"
+    volumes:
+      - /dev/shm:/dev/shm
+
+  reptile:
+    build: .
+    depends_on:
+      - chrome
+    environment:
+      - BASE_PAGE_URL=https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382
+      - BASE_URL=https://www.cbirc.gov.cn/cn/view/pages/
+      - DATE_LIMIT=2024-07-24
+      - OUTPUT_PATH_PREFIX=url
+    volumes:
+      - .:/reptile
diff --git a/scrape.py b/scrape.py
index a6a49ed..ab65088 100644
--- a/scrape.py
+++ b/scrape.py
@@ -17,6 +17,33 @@ def random_wait(min_time=1, max_time=5):
 
 
 # Function to create a new browser session with options to avoid detection
+# def create_browser():
+#     options = webdriver.ChromeOptions()
+#     options.add_argument("--disable-blink-features=AutomationControlled")
+#     options.add_argument("--headless")  # Enable headless mode
+#     options.add_argument(
+#         "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
+#     options.add_experimental_option('excludeSwitches', ['enable-automation'])
+#     options.add_experimental_option('useAutomationExtension', False)
+#     driver = webdriver.Chrome(options=options)
+#     driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
+#         'source': '''
+#             Object.defineProperty(navigator, 'webdriver', {
+#                 get: () => undefined
+#             });
+#             window.navigator.chrome = {
+#                 runtime: {}
+#             };
+#             Object.defineProperty(navigator, 'languages', {
+#                 get: () => ['en-US', 'en']
+#             });
+#             Object.defineProperty(navigator, 'plugins', {
+#                 get: () => [1, 2, 3, 4, 5]
+#             });
+#         '''
+#     })
+#     return driver
+
 def create_browser():
     options = webdriver.ChromeOptions()
     options.add_argument("--disable-blink-features=AutomationControlled")
@@ -25,23 +52,10 @@ def create_browser():
         "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
     options.add_experimental_option('excludeSwitches', ['enable-automation'])
     options.add_experimental_option('useAutomationExtension', False)
-    driver = webdriver.Chrome(options=options)
-    driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
-        'source': '''
-            Object.defineProperty(navigator, 'webdriver', {
-                get: () => undefined
-            });
-            window.navigator.chrome = {
-                runtime: {}
-            };
-            Object.defineProperty(navigator, 'languages', {
-                get: () => ['en-US', 'en']
-            });
-            Object.defineProperty(navigator, 'plugins', {
-                get: () => [1, 2, 3, 4, 5]
-            });
-        '''
-    })
+    driver = webdriver.Remote(
+        command_executor='http://chrome:4444/wd/hub',
+        options=options
+    )
     return driver
 
 
@@ -152,7 +166,7 @@ def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix):
 # Example usage
 # base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
 # base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
-# date_limit = datetime(2024, 7, 24)
+# date_limit = datetime(2003, 12, 24)
 # output_path_prefix = 'url'
 # 从环境变量读取参数
 base_page_url = os.getenv('BASE_PAGE_URL')