zhangsan 2024-08-02 14:39:03 +08:00
parent 06937c6de2
commit e28e732616
5 changed files with 58 additions and 2046 deletions

Dockerfile (new file, 29 lines)

@@ -0,0 +1,29 @@
# Use an official Python runtime as a parent image
FROM python:3.9-slim
# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
# Install dependencies
RUN apt-get update && apt-get install -y \
chromium-driver \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy the rest of the application code
COPY . /reptile
WORKDIR /reptile
# Default scraper configuration, overridable at run time
ENV BASE_PAGE_URL="https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
ENV BASE_URL="https://www.cbirc.gov.cn/cn/view/pages/"
ENV DATE_LIMIT="2024-07-24"
ENV OUTPUT_PATH_PREFIX="url"
# Run the Python script
CMD ["python", "scrape.py"]


@@ -5,6 +5,7 @@ from selenium.webdriver.common.action_chains import ActionChains
 import time
 import random
 import os
+import glob

 def clean_text(html_content):
     soup = BeautifulSoup(html_content, 'html.parser')
@@ -147,12 +148,18 @@ def create_empty_excel(filename):
     df = pd.DataFrame(columns=columns)
     df.to_excel(filename, index=False)

-def process_in_batches(urls, batch_size=100):
+def process_in_batches(url_files_pattern, output_file_prefix, batch_size=100, max_rows_per_file=10000):
+    url_files = glob.glob(url_files_pattern)
+    urls = []
+    for url_file in url_files:
+        with open(url_file, 'r') as file:
+            urls.extend([line.strip() for line in file if line.strip()])
+
     total_urls = len(urls)
     num_batches = (total_urls // batch_size) + (1 if total_urls % batch_size != 0 else 0)

     file_index = 1
-    output_filename = f'output_data{file_index}.xlsx'
+    output_filename = f'{output_file_prefix}{file_index}.xlsx'
     rows_in_file = 0

     if not os.path.exists(output_filename):
@@ -177,18 +184,17 @@ def process_in_batches(urls, batch_size=100):
         rows_in_file += batch_data.shape[0]

-        if rows_in_file >= 10000:
+        if rows_in_file >= max_rows_per_file:
             file_index += 1
-            output_filename = f'output_data{file_index}.xlsx'
+            output_filename = f'{output_file_prefix}{file_index}.xlsx'
             rows_in_file = 0

             if not os.path.exists(output_filename):
                 create_empty_excel(output_filename)

-# Read the URL list
-with open('url1.txt', 'r') as file:
-    urls = [line.strip() for line in file if line.strip()]
+# Example usage
+url_files_pattern = 'url*.txt'  # Match every txt file whose name starts with 'url'
+output_file_prefix = 'output_data'

 # Process the URLs in batches and write them to Excel
-process_in_batches(urls, batch_size=100)
-print("Data has been appended to the existing Excel file.")
+process_in_batches(url_files_pattern, output_file_prefix, batch_size=100)
+print("Data has been appended to the existing Excel files.")


@@ -1,3 +1,5 @@
+import os
+
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.action_chains import ActionChains
@@ -113,12 +115,12 @@ def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix):
                 break

         # Save URLs if they exceed 2000 and reset unique_urls
-        if len(unique_urls) >= 10:
+        if len(unique_urls) >= 2000:
             save_urls_to_file(unique_urls, file_index)
             unique_urls.clear()

         # If the current file exceeds 20000 URLs, start a new file
-        if urls_in_current_file >= 20:
+        if urls_in_current_file >= 20000:
             file_index += 1
             urls_in_current_file = 0
@@ -148,8 +150,13 @@ def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix):
 # Example usage
-base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
-base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
-date_limit = datetime(2024, 7, 24)
-output_path_prefix = 'url'
+# base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
+# base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
+# date_limit = datetime(2024, 7, 24)
+# output_path_prefix = 'url'
+
+# Read the parameters from environment variables
+base_page_url = os.getenv('BASE_PAGE_URL')
+base_url = os.getenv('BASE_URL')
+date_limit = datetime.strptime(os.getenv('DATE_LIMIT'), "%Y-%m-%d")
+output_path_prefix = os.getenv('OUTPUT_PATH_PREFIX')

 fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix)
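
One caveat worth noting: os.getenv returns None when a variable is unset, and datetime.strptime(None, ...) raises a TypeError, so the new block only works where all four variables are defined, as they are in the Dockerfile. A hedged sketch of the same pattern with explicit fallbacks, so the script also runs outside the container; the default values simply mirror the Dockerfile's ENV lines, and nothing here is in the commit itself:

import os
import sys
from datetime import datetime

# Fall back to the Dockerfile's ENV defaults when a variable is unset.
# BASE_PAGE_URL would get the same treatment; omitted here for line length.
base_url = os.getenv('BASE_URL', 'https://www.cbirc.gov.cn/cn/view/pages/')
output_path_prefix = os.getenv('OUTPUT_PATH_PREFIX', 'url')

raw_date = os.getenv('DATE_LIMIT', '2024-07-24')
try:
    date_limit = datetime.strptime(raw_date, "%Y-%m-%d")
except ValueError:
    sys.exit(f"DATE_LIMIT must be YYYY-MM-DD, got {raw_date!r}")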

url2.txt (2030 lines changed; diff suppressed because it is too large)
