2024.8.2
This commit is contained in:
parent
53bca47077
commit
18fc3c5028
11
Dockerfile
11
Dockerfile
@ -10,23 +10,22 @@ RUN sed -i 's|http://deb.debian.org|http://mirrors.aliyun.com|g' /etc/apt/source
|
|||||||
|
|
||||||
# Install dependencies
|
# Install dependencies
|
||||||
RUN apt-get update && apt-get install -y --fix-missing \
|
RUN apt-get update && apt-get install -y --fix-missing \
|
||||||
chromium-driver \
|
|
||||||
&& apt-get clean \
|
&& apt-get clean \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# Install Python dependencies
|
# Install Python dependencies
|
||||||
COPY requirements.txt .
|
COPY requirements.txt .
|
||||||
RUN pip install --no-cache-dir -r requirements.txt
|
RUN pip install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ -r requirements.txt
|
||||||
|
|
||||||
# Copy the rest of the application code
|
# Copy the rest of the application code
|
||||||
COPY . /reptile
|
COPY . /reptile
|
||||||
WORKDIR /reptile
|
WORKDIR /reptile
|
||||||
|
|
||||||
# Set environment variables
|
# Set environment variables
|
||||||
ENV BASE_PAGE_URL="https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
|
ENV BASE_PAGE_URL=${BASE_PAGE_URL}
|
||||||
ENV BASE_URL="https://www.cbirc.gov.cn/cn/view/pages/"
|
ENV BASE_URL=${BASE_URL}
|
||||||
ENV DATE_LIMIT="2024-07-24"
|
ENV DATE_LIMIT=${DATE_LIMIT}
|
||||||
ENV OUTPUT_PATH_PREFIX="url"
|
ENV OUTPUT_PATH_PREFIX=${OUTPUT_PATH_PREFIX}
|
||||||
|
|
||||||
# Run the Python script
|
# Run the Python script
|
||||||
CMD ["python", "scrape.py"]
|
CMD ["python", "scrape.py"]
|
||||||
|
20
docker-compose.yaml
Normal file
20
docker-compose.yaml
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
version: '3'
|
||||||
|
services:
|
||||||
|
chrome:
|
||||||
|
image: selenium/standalone-chrome
|
||||||
|
ports:
|
||||||
|
- "4444:4444"
|
||||||
|
volumes:
|
||||||
|
- /dev/shm:/dev/shm
|
||||||
|
|
||||||
|
reptile:
|
||||||
|
build: .
|
||||||
|
depends_on:
|
||||||
|
- chrome
|
||||||
|
environment:
|
||||||
|
- BASE_PAGE_URL=https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382
|
||||||
|
- BASE_URL=https://www.cbirc.gov.cn/cn/view/pages/
|
||||||
|
- DATE_LIMIT=2024-07-24
|
||||||
|
- OUTPUT_PATH_PREFIX=url
|
||||||
|
volumes:
|
||||||
|
- .:/reptile
|
50
scrape.py
50
scrape.py
@ -17,6 +17,33 @@ def random_wait(min_time=1, max_time=5):
|
|||||||
|
|
||||||
|
|
||||||
# Function to create a new browser session with options to avoid detection
|
# Function to create a new browser session with options to avoid detection
|
||||||
|
# def create_browser():
|
||||||
|
# options = webdriver.ChromeOptions()
|
||||||
|
# options.add_argument("--disable-blink-features=AutomationControlled")
|
||||||
|
# options.add_argument("--headless") # Enable headless mode
|
||||||
|
# options.add_argument(
|
||||||
|
# "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
|
||||||
|
# options.add_experimental_option('excludeSwitches', ['enable-automation'])
|
||||||
|
# options.add_experimental_option('useAutomationExtension', False)
|
||||||
|
# driver = webdriver.Chrome(options=options)
|
||||||
|
# driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
|
||||||
|
# 'source': '''
|
||||||
|
# Object.defineProperty(navigator, 'webdriver', {
|
||||||
|
# get: () => undefined
|
||||||
|
# });
|
||||||
|
# window.navigator.chrome = {
|
||||||
|
# runtime: {}
|
||||||
|
# };
|
||||||
|
# Object.defineProperty(navigator, 'languages', {
|
||||||
|
# get: () => ['en-US', 'en']
|
||||||
|
# });
|
||||||
|
# Object.defineProperty(navigator, 'plugins', {
|
||||||
|
# get: () => [1, 2, 3, 4, 5]
|
||||||
|
# });
|
||||||
|
# '''
|
||||||
|
# })
|
||||||
|
# return driver
|
||||||
|
|
||||||
def create_browser():
|
def create_browser():
|
||||||
options = webdriver.ChromeOptions()
|
options = webdriver.ChromeOptions()
|
||||||
options.add_argument("--disable-blink-features=AutomationControlled")
|
options.add_argument("--disable-blink-features=AutomationControlled")
|
||||||
@ -25,23 +52,10 @@ def create_browser():
|
|||||||
"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
|
"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
|
||||||
options.add_experimental_option('excludeSwitches', ['enable-automation'])
|
options.add_experimental_option('excludeSwitches', ['enable-automation'])
|
||||||
options.add_experimental_option('useAutomationExtension', False)
|
options.add_experimental_option('useAutomationExtension', False)
|
||||||
driver = webdriver.Chrome(options=options)
|
driver = webdriver.Remote(
|
||||||
driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
|
command_executor='http://chrome:4444/wd/hub',
|
||||||
'source': '''
|
options=options
|
||||||
Object.defineProperty(navigator, 'webdriver', {
|
)
|
||||||
get: () => undefined
|
|
||||||
});
|
|
||||||
window.navigator.chrome = {
|
|
||||||
runtime: {}
|
|
||||||
};
|
|
||||||
Object.defineProperty(navigator, 'languages', {
|
|
||||||
get: () => ['en-US', 'en']
|
|
||||||
});
|
|
||||||
Object.defineProperty(navigator, 'plugins', {
|
|
||||||
get: () => [1, 2, 3, 4, 5]
|
|
||||||
});
|
|
||||||
'''
|
|
||||||
})
|
|
||||||
return driver
|
return driver
|
||||||
|
|
||||||
|
|
||||||
@ -152,7 +166,7 @@ def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix):
|
|||||||
# Example usage
|
# Example usage
|
||||||
# base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
|
# base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
|
||||||
# base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
|
# base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
|
||||||
# date_limit = datetime(2024, 7, 24)
|
# date_limit = datetime(2003, 12, 24)
|
||||||
# output_path_prefix = 'url'
|
# output_path_prefix = 'url'
|
||||||
# 从环境变量读取参数
|
# 从环境变量读取参数
|
||||||
base_page_url = os.getenv('BASE_PAGE_URL')
|
base_page_url = os.getenv('BASE_PAGE_URL')
|
||||||
|
Loading…
x
Reference in New Issue
Block a user