2024.8.2
parent 06937c6de2
commit e28e732616

Dockerfile (new file, 29 lines)
@@ -0,0 +1,29 @@
+# Use an official Python runtime as a parent image
+FROM python:3.9-slim
+
+# Set environment variables
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+
+# Install dependencies
+RUN apt-get update && apt-get install -y \
+    chromium-driver \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of the application code
+COPY . /reptile
+WORKDIR /reptile
+
+# Set environment variables
+ENV BASE_PAGE_URL="https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
+ENV BASE_URL="https://www.cbirc.gov.cn/cn/view/pages/"
+ENV DATE_LIMIT="2024-07-24"
+ENV OUTPUT_PATH_PREFIX="url"
+
+# Run the Python script
+CMD ["python", "scrape.py"]
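The chromium-driver package above exists to back Selenium inside the slim image. A minimal headless setup sketch in Python, assuming the Debian default binary locations (/usr/bin/chromium and /usr/bin/chromedriver); the actual driver configuration inside scrape.py is not shown in this diff:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

# Headless flags are needed in a container with no display; the paths are
# the Debian defaults installed by chromium-driver (an assumption here).
options = Options()
options.binary_location = '/usr/bin/chromium'
options.add_argument('--headless')
options.add_argument('--no-sandbox')              # required when running as root in Docker
options.add_argument('--disable-dev-shm-usage')   # /dev/shm is small in containers
driver = webdriver.Chrome(service=Service('/usr/bin/chromedriver'), options=options)
driver.get('https://www.cbirc.gov.cn/cn/view/pages/')
print(driver.title)
driver.quit()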
@ -5,6 +5,7 @@ from selenium.webdriver.common.action_chains import ActionChains
|
|||||||
import time
|
import time
|
||||||
import random
|
import random
|
||||||
import os
|
import os
|
||||||
|
import glob
|
||||||
|
|
||||||
def clean_text(html_content):
|
def clean_text(html_content):
|
||||||
soup = BeautifulSoup(html_content, 'html.parser')
|
soup = BeautifulSoup(html_content, 'html.parser')
|
||||||
@ -147,12 +148,18 @@ def create_empty_excel(filename):
|
|||||||
df = pd.DataFrame(columns=columns)
|
df = pd.DataFrame(columns=columns)
|
||||||
df.to_excel(filename, index=False)
|
df.to_excel(filename, index=False)
|
||||||
|
|
||||||
def process_in_batches(urls, batch_size=100):
|
def process_in_batches(url_files_pattern, output_file_prefix, batch_size=100, max_rows_per_file=10000):
|
||||||
|
url_files = glob.glob(url_files_pattern)
|
||||||
|
urls = []
|
||||||
|
for url_file in url_files:
|
||||||
|
with open(url_file, 'r') as file:
|
||||||
|
urls.extend([line.strip() for line in file if line.strip()])
|
||||||
|
|
||||||
total_urls = len(urls)
|
total_urls = len(urls)
|
||||||
num_batches = (total_urls // batch_size) + (1 if total_urls % batch_size != 0 else 0)
|
num_batches = (total_urls // batch_size) + (1 if total_urls % batch_size != 0 else 0)
|
||||||
|
|
||||||
file_index = 1
|
file_index = 1
|
||||||
output_filename = f'output_data{file_index}.xlsx'
|
output_filename = f'{output_file_prefix}{file_index}.xlsx'
|
||||||
rows_in_file = 0
|
rows_in_file = 0
|
||||||
|
|
||||||
if not os.path.exists(output_filename):
|
if not os.path.exists(output_filename):
|
||||||
@ -177,18 +184,17 @@ def process_in_batches(urls, batch_size=100):
|
|||||||
|
|
||||||
rows_in_file += batch_data.shape[0]
|
rows_in_file += batch_data.shape[0]
|
||||||
|
|
||||||
if rows_in_file >= 10000:
|
if rows_in_file >= max_rows_per_file:
|
||||||
file_index += 1
|
file_index += 1
|
||||||
output_filename = f'output_data{file_index}.xlsx'
|
output_filename = f'{output_file_prefix}{file_index}.xlsx'
|
||||||
rows_in_file = 0
|
rows_in_file = 0
|
||||||
|
|
||||||
if not os.path.exists(output_filename):
|
if not os.path.exists(output_filename):
|
||||||
create_empty_excel(output_filename)
|
create_empty_excel(output_filename)
|
||||||
|
|
||||||
# 读取URL列表
|
# Example usage
|
||||||
with open('url1.txt', 'r') as file:
|
url_files_pattern = 'url*.txt' # 匹配所有以 'url' 开头的 txt 文件
|
||||||
urls = [line.strip() for line in file if line.strip()]
|
output_file_prefix = 'output_data'
|
||||||
|
|
||||||
# 分批处理URL并写入Excel
|
process_in_batches(url_files_pattern, output_file_prefix, batch_size=100)
|
||||||
process_in_batches(urls, batch_size=100)
|
print("Data has been appended to the existing Excel files.")
|
||||||
print("Data has been appended to the existing Excel file.")
|
|
||||||
|
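The change above swaps a single hard-coded url1.txt for a glob over every url*.txt and turns the 10,000-row limit into the max_rows_per_file parameter. A self-contained sketch of that rollover pattern, assuming pandas with openpyxl installed; the per-batch scraping and the append-to-existing-sheet logic are elided:

import glob
import pandas as pd

def process_in_batches(url_files_pattern, output_file_prefix,
                       batch_size=100, max_rows_per_file=10000):
    # Collect URLs from every matching file, skipping blank lines.
    urls = []
    for url_file in glob.glob(url_files_pattern):
        with open(url_file, 'r') as fh:
            urls.extend(line.strip() for line in fh if line.strip())

    file_index, rows_in_file = 1, 0
    for start in range(0, len(urls), batch_size):
        batch_data = pd.DataFrame({'url': urls[start:start + batch_size]})
        output_filename = f'{output_file_prefix}{file_index}.xlsx'
        batch_data.to_excel(output_filename, index=False)  # the real code appends
        rows_in_file += batch_data.shape[0]
        if rows_in_file >= max_rows_per_file:  # roll over to a fresh workbook
            file_index += 1
            rows_in_file = 0

process_in_batches('url*.txt', 'output_data')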
scrape.py (19 lines changed)
@@ -1,3 +1,5 @@
+import os
+
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.action_chains import ActionChains
@ -113,12 +115,12 @@ def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix):
|
|||||||
break
|
break
|
||||||
|
|
||||||
# Save URLs if they exceed 2000 and reset unique_urls
|
# Save URLs if they exceed 2000 and reset unique_urls
|
||||||
if len(unique_urls) >= 10:
|
if len(unique_urls) >= 2000:
|
||||||
save_urls_to_file(unique_urls, file_index)
|
save_urls_to_file(unique_urls, file_index)
|
||||||
unique_urls.clear()
|
unique_urls.clear()
|
||||||
|
|
||||||
# If the current file exceeds 20000 URLs, start a new file
|
# If the current file exceeds 20000 URLs, start a new file
|
||||||
if urls_in_current_file >= 20:
|
if urls_in_current_file >= 20000:
|
||||||
file_index += 1
|
file_index += 1
|
||||||
urls_in_current_file = 0
|
urls_in_current_file = 0
|
||||||
|
|
||||||
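The 10 and 20 on the removed lines look like leftover debug values; the new thresholds match what the comments already describe (flush the buffer every 2,000 URLs, rotate output files at 20,000). The counter pattern in isolation, as a sketch; save_urls_to_file and candidate_urls are hypothetical stand-ins here:

def save_urls_to_file(urls, file_index):
    # Hypothetical stand-in for the helper in scrape.py.
    with open(f'url{file_index}.txt', 'a') as fh:
        fh.writelines(url + '\n' for url in urls)

candidate_urls = (f'https://example.com/doc/{i}' for i in range(50000))  # stand-in crawl output

unique_urls = set()
file_index = 1
urls_in_current_file = 0
for url in candidate_urls:
    unique_urls.add(url)
    urls_in_current_file += 1
    if len(unique_urls) >= 2000:       # flush the in-memory buffer every 2,000 URLs
        save_urls_to_file(unique_urls, file_index)
        unique_urls.clear()
    if urls_in_current_file >= 20000:  # cap each url<N>.txt at 20,000 URLs
        file_index += 1
        urls_in_current_file = 0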
@ -148,8 +150,13 @@ def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix):
|
|||||||
|
|
||||||
|
|
||||||
# Example usage
|
# Example usage
|
||||||
base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
|
# base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
|
||||||
base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
|
# base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
|
||||||
date_limit = datetime(2024, 7, 24)
|
# date_limit = datetime(2024, 7, 24)
|
||||||
output_path_prefix = 'url'
|
# output_path_prefix = 'url'
|
||||||
|
# 从环境变量读取参数
|
||||||
|
base_page_url = os.getenv('BASE_PAGE_URL')
|
||||||
|
base_url = os.getenv('BASE_URL')
|
||||||
|
date_limit = datetime.strptime(os.getenv('DATE_LIMIT'), "%Y-%m-%d")
|
||||||
|
output_path_prefix = os.getenv('OUTPUT_PATH_PREFIX')
|
||||||
fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix)
|
fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix)
|
||||||
|
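One sharp edge in the new getenv block: os.getenv returns None for a missing variable, so datetime.strptime(os.getenv('DATE_LIMIT'), "%Y-%m-%d") raises a TypeError if the script is launched without that variable set. A slightly defensive variant, as a sketch; the fallback values are assumptions mirroring the Dockerfile's ENV lines:

import os
from datetime import datetime

base_page_url = os.environ['BASE_PAGE_URL']  # fail fast with a KeyError if unset
base_url = os.getenv('BASE_URL', 'https://www.cbirc.gov.cn/cn/view/pages/')
raw_limit = os.getenv('DATE_LIMIT')
if raw_limit is None:
    raise SystemExit('DATE_LIMIT must be set, e.g. DATE_LIMIT=2024-07-24')
date_limit = datetime.strptime(raw_limit, '%Y-%m-%d')
output_path_prefix = os.getenv('OUTPUT_PATH_PREFIX', 'url')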