zhangsan 2024-08-02 14:39:03 +08:00
parent 06937c6de2
commit e28e732616
5 changed files with 58 additions and 2046 deletions

Dockerfile (new file, 29 lines)

@@ -0,0 +1,29 @@
# Use an official Python runtime as a parent image
FROM python:3.9-slim
# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
# Install dependencies
RUN apt-get update && apt-get install -y \
chromium-driver \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy the rest of the application code
COPY . /reptile
WORKDIR /reptile
# Default scraper configuration, overridable at run time
ENV BASE_PAGE_URL="https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
ENV BASE_URL="https://www.cbirc.gov.cn/cn/view/pages/"
ENV DATE_LIMIT="2024-07-24"
ENV OUTPUT_PATH_PREFIX="url"
# Run the Python script
CMD ["python", "scrape.py"]


@@ -5,6 +5,7 @@ from selenium.webdriver.common.action_chains import ActionChains
 import time
 import random
 import os
+import glob

 def clean_text(html_content):
     soup = BeautifulSoup(html_content, 'html.parser')
@@ -147,12 +148,18 @@ def create_empty_excel(filename):
     df = pd.DataFrame(columns=columns)
     df.to_excel(filename, index=False)

-def process_in_batches(urls, batch_size=100):
+def process_in_batches(url_files_pattern, output_file_prefix, batch_size=100, max_rows_per_file=10000):
+    url_files = glob.glob(url_files_pattern)
+    urls = []
+    for url_file in url_files:
+        with open(url_file, 'r') as file:
+            urls.extend([line.strip() for line in file if line.strip()])
+
     total_urls = len(urls)
     num_batches = (total_urls // batch_size) + (1 if total_urls % batch_size != 0 else 0)

     file_index = 1
-    output_filename = f'output_data{file_index}.xlsx'
+    output_filename = f'{output_file_prefix}{file_index}.xlsx'
     rows_in_file = 0

     if not os.path.exists(output_filename):
@@ -177,18 +184,17 @@ def process_in_batches(urls, batch_size=100):
         rows_in_file += batch_data.shape[0]

-        if rows_in_file >= 10000:
+        if rows_in_file >= max_rows_per_file:
             file_index += 1
-            output_filename = f'output_data{file_index}.xlsx'
+            output_filename = f'{output_file_prefix}{file_index}.xlsx'
             rows_in_file = 0

             if not os.path.exists(output_filename):
                 create_empty_excel(output_filename)

-# Read the URL list
-with open('url1.txt', 'r') as file:
-    urls = [line.strip() for line in file if line.strip()]
+# Example usage
+url_files_pattern = 'url*.txt'  # Match every txt file whose name starts with 'url'
+output_file_prefix = 'output_data'

 # Process the URLs in batches and write them to Excel
-process_in_batches(urls, batch_size=100)
-print("Data has been appended to the existing Excel file.")
+process_in_batches(url_files_pattern, output_file_prefix, batch_size=100)
+print("Data has been appended to the existing Excel files.")


@@ -1,3 +1,5 @@
+import os
+
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.action_chains import ActionChains
@@ -113,12 +115,12 @@ def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix):
                 break

         # Save URLs if they exceed 2000 and reset unique_urls
-        if len(unique_urls) >= 10:
+        if len(unique_urls) >= 2000:
             save_urls_to_file(unique_urls, file_index)
             unique_urls.clear()

         # If the current file exceeds 20000 URLs, start a new file
-        if urls_in_current_file >= 20:
+        if urls_in_current_file >= 20000:
             file_index += 1
             urls_in_current_file = 0
@@ -148,8 +150,13 @@ def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix):
 # Example usage
-base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
-base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
-date_limit = datetime(2024, 7, 24)
-output_path_prefix = 'url'
+# base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
+# base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
+# date_limit = datetime(2024, 7, 24)
+# output_path_prefix = 'url'
+
+# Read the parameters from environment variables
+base_page_url = os.getenv('BASE_PAGE_URL')
+base_url = os.getenv('BASE_URL')
+date_limit = datetime.strptime(os.getenv('DATE_LIMIT'), "%Y-%m-%d")
+output_path_prefix = os.getenv('OUTPUT_PATH_PREFIX')

 fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix)
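
One caveat worth noting: os.getenv returns None when a variable is unset, and datetime.strptime(None, ...) raises a TypeError, so the new block only works where all four variables are defined, as they are in the Dockerfile. A hedged sketch of the same pattern with explicit fallbacks, so the script also runs outside the container; the default values simply mirror the Dockerfile's ENV lines, and nothing here is in the commit itself:

import os
import sys
from datetime import datetime

# Fall back to the Dockerfile's ENV defaults when a variable is unset.
# BASE_PAGE_URL would get the same treatment; omitted here for line length.
base_url = os.getenv('BASE_URL', 'https://www.cbirc.gov.cn/cn/view/pages/')
output_path_prefix = os.getenv('OUTPUT_PATH_PREFIX', 'url')

raw_date = os.getenv('DATE_LIMIT', '2024-07-24')
try:
    date_limit = datetime.strptime(raw_date, "%Y-%m-%d")
except ValueError:
    sys.exit(f"DATE_LIMIT must be YYYY-MM-DD, got {raw_date!r}")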

url2.txt (2030 lines changed; diff suppressed because it is too large)
