This commit is contained in:
zhangsan 2024-08-02 19:44:26 +08:00
parent 53bca47077
commit 18fc3c5028
3 changed files with 57 additions and 24 deletions

View File

@ -10,23 +10,22 @@ RUN sed -i 's|http://deb.debian.org|http://mirrors.aliyun.com|g' /etc/apt/source
# Install dependencies
RUN apt-get update && apt-get install -y --fix-missing \
chromium-driver \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ -r requirements.txt
# Copy the rest of the application code
COPY . /reptile
WORKDIR /reptile
# Set environment variables
ENV BASE_PAGE_URL="https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
ENV BASE_URL="https://www.cbirc.gov.cn/cn/view/pages/"
ENV DATE_LIMIT="2024-07-24"
ENV OUTPUT_PATH_PREFIX="url"
ENV BASE_PAGE_URL=${BASE_PAGE_URL}
ENV BASE_URL=${BASE_URL}
ENV DATE_LIMIT=${DATE_LIMIT}
ENV OUTPUT_PATH_PREFIX=${OUTPUT_PATH_PREFIX}
# Run the Python script
CMD ["python", "scrape.py"]

20
docker-compose.yaml Normal file
View File

@ -0,0 +1,20 @@
# Compose stack with two services: a Selenium standalone Chrome browser
# ("chrome") and the scraper ("reptile") built from this directory.
version: '3'
services:
# Browser service, run from the stock selenium/standalone-chrome image.
chrome:
image: selenium/standalone-chrome
# Publish container port 4444 on host port 4444.
ports:
- "4444:4444"
# Share the host's /dev/shm with the container.
# NOTE(review): presumably to give Chrome enough shared memory -- confirm.
volumes:
- /dev/shm:/dev/shm
# Scraper service; the image is built using the current directory as context.
reptile:
build: .
# Start after the chrome service has been started.
# NOTE(review): this orders startup only; confirm the scraper tolerates the
# browser endpoint not being ready to accept sessions yet.
depends_on:
- chrome
# Scraper settings passed to the container as environment variables.
environment:
- BASE_PAGE_URL=https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382
- BASE_URL=https://www.cbirc.gov.cn/cn/view/pages/
- DATE_LIMIT=2024-07-24
- OUTPUT_PATH_PREFIX=url
# Bind-mount the project directory at /reptile inside the container.
volumes:
- .:/reptile

View File

@ -17,6 +17,33 @@ def random_wait(min_time=1, max_time=5):
# Function to create a new browser session with options to avoid detection
# def create_browser():
# options = webdriver.ChromeOptions()
# options.add_argument("--disable-blink-features=AutomationControlled")
# options.add_argument("--headless") # Enable headless mode
# options.add_argument(
# "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
# options.add_experimental_option('excludeSwitches', ['enable-automation'])
# options.add_experimental_option('useAutomationExtension', False)
# driver = webdriver.Chrome(options=options)
# driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
# 'source': '''
# Object.defineProperty(navigator, 'webdriver', {
# get: () => undefined
# });
# window.navigator.chrome = {
# runtime: {}
# };
# Object.defineProperty(navigator, 'languages', {
# get: () => ['en-US', 'en']
# });
# Object.defineProperty(navigator, 'plugins', {
# get: () => [1, 2, 3, 4, 5]
# });
# '''
# })
# return driver
def create_browser():
options = webdriver.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")
@ -25,23 +52,10 @@ def create_browser():
"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options)
driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
'source': '''
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
});
window.navigator.chrome = {
runtime: {}
};
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US', 'en']
});
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5]
});
'''
})
driver = webdriver.Remote(
command_executor='http://chrome:4444/wd/hub',
options=options
)
return driver
@ -152,7 +166,7 @@ def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix):
# Example usage
# base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
# base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
# date_limit = datetime(2024, 7, 24)
# date_limit = datetime(2003, 12, 24)
# output_path_prefix = 'url'
# Read parameters from environment variables
base_page_url = os.getenv('BASE_PAGE_URL')