2024.8.2
This commit is contained in:
parent
53bca47077
commit
18fc3c5028
11
Dockerfile
11
Dockerfile
@@ -10,23 +10,22 @@ RUN sed -i 's|http://deb.debian.org|http://mirrors.aliyun.com|g' /etc/apt/source
|
||||
|
||||
# Install dependencies
|
||||
RUN apt-get update && apt-get install -y --fix-missing \
|
||||
chromium-driver \
|
||||
&& apt-get clean \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Python dependencies
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
RUN pip install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ -r requirements.txt
|
||||
|
||||
# Copy the rest of the application code
|
||||
COPY . /reptile
|
||||
WORKDIR /reptile
|
||||
|
||||
# Set environment variables
|
||||
ENV BASE_PAGE_URL="https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
|
||||
ENV BASE_URL="https://www.cbirc.gov.cn/cn/view/pages/"
|
||||
ENV DATE_LIMIT="2024-07-24"
|
||||
ENV OUTPUT_PATH_PREFIX="url"
|
||||
ENV BASE_PAGE_URL=${BASE_PAGE_URL}
|
||||
ENV BASE_URL=${BASE_URL}
|
||||
ENV DATE_LIMIT=${DATE_LIMIT}
|
||||
ENV OUTPUT_PATH_PREFIX=${OUTPUT_PATH_PREFIX}
|
||||
|
||||
# Run the Python script
|
||||
CMD ["python", "scrape.py"]
|
||||
|
20
docker-compose.yaml
Normal file
20
docker-compose.yaml
Normal file
@@ -0,0 +1,20 @@
|
||||
version: '3'
|
||||
services:
|
||||
chrome:
|
||||
image: selenium/standalone-chrome
|
||||
ports:
|
||||
- "4444:4444"
|
||||
volumes:
|
||||
- /dev/shm:/dev/shm
|
||||
|
||||
reptile:
|
||||
build: .
|
||||
depends_on:
|
||||
- chrome
|
||||
environment:
|
||||
- BASE_PAGE_URL=https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382
|
||||
- BASE_URL=https://www.cbirc.gov.cn/cn/view/pages/
|
||||
- DATE_LIMIT=2024-07-24
|
||||
- OUTPUT_PATH_PREFIX=url
|
||||
volumes:
|
||||
- .:/reptile
|
50
scrape.py
50
scrape.py
@@ -17,6 +17,33 @@ def random_wait(min_time=1, max_time=5):
|
||||
|
||||
|
||||
# Function to create a new browser session with options to avoid detection
|
||||
# def create_browser():
|
||||
# options = webdriver.ChromeOptions()
|
||||
# options.add_argument("--disable-blink-features=AutomationControlled")
|
||||
# options.add_argument("--headless") # Enable headless mode
|
||||
# options.add_argument(
|
||||
# "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
|
||||
# options.add_experimental_option('excludeSwitches', ['enable-automation'])
|
||||
# options.add_experimental_option('useAutomationExtension', False)
|
||||
# driver = webdriver.Chrome(options=options)
|
||||
# driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
|
||||
# 'source': '''
|
||||
# Object.defineProperty(navigator, 'webdriver', {
|
||||
# get: () => undefined
|
||||
# });
|
||||
# window.navigator.chrome = {
|
||||
# runtime: {}
|
||||
# };
|
||||
# Object.defineProperty(navigator, 'languages', {
|
||||
# get: () => ['en-US', 'en']
|
||||
# });
|
||||
# Object.defineProperty(navigator, 'plugins', {
|
||||
# get: () => [1, 2, 3, 4, 5]
|
||||
# });
|
||||
# '''
|
||||
# })
|
||||
# return driver
|
||||
|
||||
def create_browser():
|
||||
options = webdriver.ChromeOptions()
|
||||
options.add_argument("--disable-blink-features=AutomationControlled")
|
||||
@@ -25,23 +52,10 @@ def create_browser():
|
||||
"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
|
||||
options.add_experimental_option('excludeSwitches', ['enable-automation'])
|
||||
options.add_experimental_option('useAutomationExtension', False)
|
||||
driver = webdriver.Chrome(options=options)
|
||||
driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
|
||||
'source': '''
|
||||
Object.defineProperty(navigator, 'webdriver', {
|
||||
get: () => undefined
|
||||
});
|
||||
window.navigator.chrome = {
|
||||
runtime: {}
|
||||
};
|
||||
Object.defineProperty(navigator, 'languages', {
|
||||
get: () => ['en-US', 'en']
|
||||
});
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => [1, 2, 3, 4, 5]
|
||||
});
|
||||
'''
|
||||
})
|
||||
driver = webdriver.Remote(
|
||||
command_executor='http://chrome:4444/wd/hub',
|
||||
options=options
|
||||
)
|
||||
return driver
|
||||
|
||||
|
||||
@@ -152,7 +166,7 @@ def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix):
|
||||
# Example usage
|
||||
# base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
|
||||
# base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
|
||||
# date_limit = datetime(2024, 7, 24)
|
||||
# date_limit = datetime(2003, 12, 24)
|
||||
# output_path_prefix = 'url'
|
||||
# Read parameters from environment variables
|
||||
base_page_url = os.getenv('BASE_PAGE_URL')
|
||||
|
Loading…
x
Reference in New Issue
Block a user