2024.8.2
This commit is contained in:
parent 06937c6de2
commit e28e732616
Dockerfile (new file, 29 lines)
@@ -0,0 +1,29 @@
+# Use an official Python runtime as a parent image
+FROM python:3.9-slim
+
+# Set environment variables
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+
+# Install dependencies
+RUN apt-get update && apt-get install -y \
+    chromium-driver \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of the application code
+COPY . /reptile
+WORKDIR /reptile
+
+# Set environment variables
+ENV BASE_PAGE_URL="https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
+ENV BASE_URL="https://www.cbirc.gov.cn/cn/view/pages/"
+ENV DATE_LIMIT="2024-07-24"
+ENV OUTPUT_PATH_PREFIX="url"
+
+# Run the Python script
+CMD ["python", "scrape.py"]
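The build-time defaults above can be overridden when the container starts, which is presumably why this commit moves the configuration out of the script and into ENV. A minimal build-and-run sketch, assuming the image is tagged reptile (the tag is an arbitrary choice, not something this commit sets):

docker build -t reptile .
docker run --rm -e DATE_LIMIT="2024-08-01" reptile

Any of the four ENV values can be swapped at run time with -e in the same way.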
@@ -5,6 +5,7 @@ from selenium.webdriver.common.action_chains import ActionChains
 import time
 import random
 import os
+import glob

 def clean_text(html_content):
     soup = BeautifulSoup(html_content, 'html.parser')
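Only the opening line of clean_text is visible in this hunk. For orientation, a minimal sketch of what such a BeautifulSoup helper typically does; the body below is an assumption, not the commit's actual code:

from bs4 import BeautifulSoup

def clean_text(html_content):
    # Parse the HTML and collapse it to whitespace-normalized plain text.
    soup = BeautifulSoup(html_content, 'html.parser')
    return ' '.join(soup.get_text().split())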
@@ -147,12 +148,18 @@ def create_empty_excel(filename):
     df = pd.DataFrame(columns=columns)
     df.to_excel(filename, index=False)

-def process_in_batches(urls, batch_size=100):
+def process_in_batches(url_files_pattern, output_file_prefix, batch_size=100, max_rows_per_file=10000):
+    url_files = glob.glob(url_files_pattern)
+    urls = []
+    for url_file in url_files:
+        with open(url_file, 'r') as file:
+            urls.extend([line.strip() for line in file if line.strip()])
+
     total_urls = len(urls)
     num_batches = (total_urls // batch_size) + (1 if total_urls % batch_size != 0 else 0)

     file_index = 1
-    output_filename = f'output_data{file_index}.xlsx'
+    output_filename = f'{output_file_prefix}{file_index}.xlsx'
     rows_in_file = 0

     if not os.path.exists(output_filename):
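The num_batches expression above is hand-rolled ceiling division. An equivalent, arguably clearer form (the sample values are illustrative only):

import math

total_urls, batch_size = 250, 100
num_batches = math.ceil(total_urls / batch_size)  # 3, same as (250 // 100) + 1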
@@ -177,18 +184,17 @@ def process_in_batches(urls, batch_size=100):

         rows_in_file += batch_data.shape[0]

-        if rows_in_file >= 10000:
+        if rows_in_file >= max_rows_per_file:
             file_index += 1
-            output_filename = f'output_data{file_index}.xlsx'
+            output_filename = f'{output_file_prefix}{file_index}.xlsx'
             rows_in_file = 0

             if not os.path.exists(output_filename):
                 create_empty_excel(output_filename)

-# Read the URL list
-with open('url1.txt', 'r') as file:
-    urls = [line.strip() for line in file if line.strip()]
+# Example usage
+url_files_pattern = 'url*.txt'  # match every txt file whose name starts with 'url'
+output_file_prefix = 'output_data'

 # Process the URLs in batches and write them to Excel
-process_in_batches(urls, batch_size=100)
-print("Data has been appended to the existing Excel file.")
+process_in_batches(url_files_pattern, output_file_prefix, batch_size=100)
+print("Data has been appended to the existing Excel files.")
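The hunks above track rows_in_file and roll over to a new workbook at max_rows_per_file, but the step that actually appends batch_data to the current file sits outside this diff. A sketch of one plausible pandas implementation; the helper name and body are assumptions:

import os
import pandas as pd

def append_to_excel(filename, batch_data):
    # Read the existing sheet if present, stack the new rows, and rewrite.
    if os.path.exists(filename):
        existing = pd.read_excel(filename)
        batch_data = pd.concat([existing, batch_data], ignore_index=True)
    batch_data.to_excel(filename, index=False)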
scrape.py (19 changed lines)
@@ -1,3 +1,5 @@
+import os
+
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.action_chains import ActionChains
@@ -113,12 +115,12 @@ def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix):
                 break

         # Save URLs if they exceed 2000 and reset unique_urls
-        if len(unique_urls) >= 10:
+        if len(unique_urls) >= 2000:
             save_urls_to_file(unique_urls, file_index)
             unique_urls.clear()

         # If the current file exceeds 20000 URLs, start a new file
-        if urls_in_current_file >= 20:
+        if urls_in_current_file >= 20000:
             file_index += 1
             urls_in_current_file = 0

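The old constants (10 and 20) read like leftover debug values; this hunk raises them to the limits the surrounding comments already describe (2000 and 20000). save_urls_to_file itself is not in the diff, but given that the batch processor globs url*.txt, it presumably writes files named url1.txt, url2.txt, and so on. A sketch under that assumption:

def save_urls_to_file(urls, file_index, output_path_prefix='url'):
    # Hypothetical body: append this chunk of URLs to url<file_index>.txt,
    # the files the companion script later collects via the 'url*.txt' glob.
    with open(f'{output_path_prefix}{file_index}.txt', 'a') as f:
        f.write('\n'.join(urls) + '\n')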
@@ -148,8 +150,13 @@ def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix):


 # Example usage
-base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
-base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
-date_limit = datetime(2024, 7, 24)
-output_path_prefix = 'url'
+# base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
+# base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
+# date_limit = datetime(2024, 7, 24)
+# output_path_prefix = 'url'
+# Read the parameters from environment variables
+base_page_url = os.getenv('BASE_PAGE_URL')
+base_url = os.getenv('BASE_URL')
+date_limit = datetime.strptime(os.getenv('DATE_LIMIT'), "%Y-%m-%d")
+output_path_prefix = os.getenv('OUTPUT_PATH_PREFIX')
 fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix)
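One caveat with the new configuration block: os.getenv returns None for an unset variable, so datetime.strptime(os.getenv('DATE_LIMIT'), "%Y-%m-%d") raises a TypeError when the script runs outside the container. A defensive variant that falls back to the Dockerfile's defaults; the fallback values are copied from the ENV lines above, not from the script itself:

import os
from datetime import datetime

# Fall back to the same defaults the Dockerfile bakes in via ENV.
date_limit = datetime.strptime(os.getenv('DATE_LIMIT', '2024-07-24'), "%Y-%m-%d")
output_path_prefix = os.getenv('OUTPUT_PATH_PREFIX', 'url')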