2024.8.2
parent 06937c6de2
commit e28e732616

Dockerfile (new file, 29 lines)
@@ -0,0 +1,29 @@
+# Use an official Python runtime as a parent image
+FROM python:3.9-slim
+
+# Set environment variables
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+
+# Install dependencies
+RUN apt-get update && apt-get install -y \
+    chromium-driver \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of the application code
+COPY . /reptile
+WORKDIR /reptile
+
+# Set environment variables
+ENV BASE_PAGE_URL="https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
+ENV BASE_URL="https://www.cbirc.gov.cn/cn/view/pages/"
+ENV DATE_LIMIT="2024-07-24"
+ENV OUTPUT_PATH_PREFIX="url"
+
+# Run the Python script
+CMD ["python", "scrape.py"]
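The chromium-driver package above exists to back Selenium inside the slim image. A minimal headless setup sketch in Python, assuming the Debian default binary locations (/usr/bin/chromium and /usr/bin/chromedriver); the actual driver configuration inside scrape.py is not shown in this diff:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

# Headless flags are needed in a container with no display; the paths are
# the Debian defaults installed by chromium-driver (an assumption here).
options = Options()
options.binary_location = '/usr/bin/chromium'
options.add_argument('--headless')
options.add_argument('--no-sandbox')              # required when running as root in Docker
options.add_argument('--disable-dev-shm-usage')   # /dev/shm is small in containers
driver = webdriver.Chrome(service=Service('/usr/bin/chromedriver'), options=options)
driver.get('https://www.cbirc.gov.cn/cn/view/pages/')
print(driver.title)
driver.quit()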
@ -5,6 +5,7 @@ from selenium.webdriver.common.action_chains import ActionChains
|
|||||||
import time
|
import time
|
||||||
import random
|
import random
|
||||||
import os
|
import os
|
||||||
|
import glob
|
||||||
|
|
||||||
def clean_text(html_content):
|
def clean_text(html_content):
|
||||||
soup = BeautifulSoup(html_content, 'html.parser')
|
soup = BeautifulSoup(html_content, 'html.parser')
|
||||||
@ -147,12 +148,18 @@ def create_empty_excel(filename):
|
|||||||
df = pd.DataFrame(columns=columns)
|
df = pd.DataFrame(columns=columns)
|
||||||
df.to_excel(filename, index=False)
|
df.to_excel(filename, index=False)
|
||||||
|
|
||||||
def process_in_batches(urls, batch_size=100):
|
def process_in_batches(url_files_pattern, output_file_prefix, batch_size=100, max_rows_per_file=10000):
|
||||||
|
url_files = glob.glob(url_files_pattern)
|
||||||
|
urls = []
|
||||||
|
for url_file in url_files:
|
||||||
|
with open(url_file, 'r') as file:
|
||||||
|
urls.extend([line.strip() for line in file if line.strip()])
|
||||||
|
|
||||||
total_urls = len(urls)
|
total_urls = len(urls)
|
||||||
num_batches = (total_urls // batch_size) + (1 if total_urls % batch_size != 0 else 0)
|
num_batches = (total_urls // batch_size) + (1 if total_urls % batch_size != 0 else 0)
|
||||||
|
|
||||||
file_index = 1
|
file_index = 1
|
||||||
output_filename = f'output_data{file_index}.xlsx'
|
output_filename = f'{output_file_prefix}{file_index}.xlsx'
|
||||||
rows_in_file = 0
|
rows_in_file = 0
|
||||||
|
|
||||||
if not os.path.exists(output_filename):
|
if not os.path.exists(output_filename):
|
||||||
@ -177,18 +184,17 @@ def process_in_batches(urls, batch_size=100):
|
|||||||
|
|
||||||
rows_in_file += batch_data.shape[0]
|
rows_in_file += batch_data.shape[0]
|
||||||
|
|
||||||
if rows_in_file >= 10000:
|
if rows_in_file >= max_rows_per_file:
|
||||||
file_index += 1
|
file_index += 1
|
||||||
output_filename = f'output_data{file_index}.xlsx'
|
output_filename = f'{output_file_prefix}{file_index}.xlsx'
|
||||||
rows_in_file = 0
|
rows_in_file = 0
|
||||||
|
|
||||||
if not os.path.exists(output_filename):
|
if not os.path.exists(output_filename):
|
||||||
create_empty_excel(output_filename)
|
create_empty_excel(output_filename)
|
||||||
|
|
||||||
# 读取URL列表
|
# Example usage
|
||||||
with open('url1.txt', 'r') as file:
|
url_files_pattern = 'url*.txt' # 匹配所有以 'url' 开头的 txt 文件
|
||||||
urls = [line.strip() for line in file if line.strip()]
|
output_file_prefix = 'output_data'
|
||||||
|
|
||||||
# 分批处理URL并写入Excel
|
process_in_batches(url_files_pattern, output_file_prefix, batch_size=100)
|
||||||
process_in_batches(urls, batch_size=100)
|
print("Data has been appended to the existing Excel files.")
|
||||||
print("Data has been appended to the existing Excel file.")
|
|
||||||
|
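The change above swaps a single hard-coded url1.txt for a glob over every url*.txt and turns the 10,000-row limit into the max_rows_per_file parameter. A self-contained sketch of that rollover pattern, assuming pandas with openpyxl installed; the per-batch scraping and the append-to-existing-sheet logic are elided:

import glob
import pandas as pd

def process_in_batches(url_files_pattern, output_file_prefix,
                       batch_size=100, max_rows_per_file=10000):
    # Collect URLs from every matching file, skipping blank lines.
    urls = []
    for url_file in glob.glob(url_files_pattern):
        with open(url_file, 'r') as fh:
            urls.extend(line.strip() for line in fh if line.strip())

    file_index, rows_in_file = 1, 0
    for start in range(0, len(urls), batch_size):
        batch_data = pd.DataFrame({'url': urls[start:start + batch_size]})
        output_filename = f'{output_file_prefix}{file_index}.xlsx'
        batch_data.to_excel(output_filename, index=False)  # the real code appends
        rows_in_file += batch_data.shape[0]
        if rows_in_file >= max_rows_per_file:  # roll over to a fresh workbook
            file_index += 1
            rows_in_file = 0

process_in_batches('url*.txt', 'output_data')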
scrape.py (19 lines changed)
@@ -1,3 +1,5 @@
+import os
+
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.action_chains import ActionChains
@ -113,12 +115,12 @@ def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix):
|
|||||||
break
|
break
|
||||||
|
|
||||||
# Save URLs if they exceed 2000 and reset unique_urls
|
# Save URLs if they exceed 2000 and reset unique_urls
|
||||||
if len(unique_urls) >= 10:
|
if len(unique_urls) >= 2000:
|
||||||
save_urls_to_file(unique_urls, file_index)
|
save_urls_to_file(unique_urls, file_index)
|
||||||
unique_urls.clear()
|
unique_urls.clear()
|
||||||
|
|
||||||
# If the current file exceeds 20000 URLs, start a new file
|
# If the current file exceeds 20000 URLs, start a new file
|
||||||
if urls_in_current_file >= 20:
|
if urls_in_current_file >= 20000:
|
||||||
file_index += 1
|
file_index += 1
|
||||||
urls_in_current_file = 0
|
urls_in_current_file = 0
|
||||||
|
|
||||||
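The 10 and 20 on the removed lines look like leftover debug values; the new thresholds match what the comments already describe (flush the buffer every 2,000 URLs, rotate output files at 20,000). The counter pattern in isolation, as a sketch; save_urls_to_file and candidate_urls are hypothetical stand-ins here:

def save_urls_to_file(urls, file_index):
    # Hypothetical stand-in for the helper in scrape.py.
    with open(f'url{file_index}.txt', 'a') as fh:
        fh.writelines(url + '\n' for url in urls)

candidate_urls = (f'https://example.com/doc/{i}' for i in range(50000))  # stand-in crawl output

unique_urls = set()
file_index = 1
urls_in_current_file = 0
for url in candidate_urls:
    unique_urls.add(url)
    urls_in_current_file += 1
    if len(unique_urls) >= 2000:       # flush the in-memory buffer every 2,000 URLs
        save_urls_to_file(unique_urls, file_index)
        unique_urls.clear()
    if urls_in_current_file >= 20000:  # cap each url<N>.txt at 20,000 URLs
        file_index += 1
        urls_in_current_file = 0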
@ -148,8 +150,13 @@ def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix):
|
|||||||
|
|
||||||
|
|
||||||
# Example usage
|
# Example usage
|
||||||
base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
|
# base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
|
||||||
base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
|
# base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
|
||||||
date_limit = datetime(2024, 7, 24)
|
# date_limit = datetime(2024, 7, 24)
|
||||||
output_path_prefix = 'url'
|
# output_path_prefix = 'url'
|
||||||
|
# 从环境变量读取参数
|
||||||
|
base_page_url = os.getenv('BASE_PAGE_URL')
|
||||||
|
base_url = os.getenv('BASE_URL')
|
||||||
|
date_limit = datetime.strptime(os.getenv('DATE_LIMIT'), "%Y-%m-%d")
|
||||||
|
output_path_prefix = os.getenv('OUTPUT_PATH_PREFIX')
|
||||||
fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix)
|
fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix)
|
||||||
|
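One sharp edge in the new getenv block: os.getenv returns None for a missing variable, so datetime.strptime(os.getenv('DATE_LIMIT'), "%Y-%m-%d") raises a TypeError if the script is launched without that variable set. A slightly defensive variant, as a sketch; the fallback values are assumptions mirroring the Dockerfile's ENV lines:

import os
from datetime import datetime

base_page_url = os.environ['BASE_PAGE_URL']  # fail fast with a KeyError if unset
base_url = os.getenv('BASE_URL', 'https://www.cbirc.gov.cn/cn/view/pages/')
raw_limit = os.getenv('DATE_LIMIT')
if raw_limit is None:
    raise SystemExit('DATE_LIMIT must be set, e.g. DATE_LIMIT=2024-07-24')
date_limit = datetime.strptime(raw_limit, '%Y-%m-%d')
output_path_prefix = os.getenv('OUTPUT_PATH_PREFIX', 'url')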