zhangsan 2024-08-02 14:20:40 +08:00
parent bc540c7be8
commit 06937c6de2
6 changed files with 178 additions and 121 deletions

View File

@@ -1,2 +1,2 @@
ww
111
The scrape.py script is responsible for scraping the URLs
main_extraction.py is responsible for scraping the page content
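
A rough sketch of how the two stages are meant to chain together, assuming scrape.py has already written its url1.txt, url2.txt, ... chunks (the 'url' prefix comes from output_path_prefix in scrape.py) and that process_in_batches can be imported from main_extraction.py without triggering its module-level run (the current file does not guard that code with __main__):

import glob
from main_extraction import process_in_batches  # assumption: the module-level run is moved under a __main__ guard

for url_file in sorted(glob.glob('url*.txt')):
    with open(url_file, 'r') as f:
        urls = [line.strip() for line in f if line.strip()]
    process_in_batches(urls, batch_size=100)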

View File

@@ -4,19 +4,17 @@ from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import time
import random
import os
def clean_text(html_content):
soup = BeautifulSoup(html_content, 'html.parser')
paragraphs = soup.find_all('p')
lines = []
for p in paragraphs:
# Take the text of the <p>'s direct <span> children (e.g. <span>(一)</span>), discarding the span tags themselves
line = ''.join([span.get_text(strip=True) for span in p.find_all('span', recursive=False)])
lines.append(line)
return '\n'.join(lines).strip()
def process_table(table_rows):
results = {
"行政处罚决定书文号": "",
@@ -53,30 +51,20 @@ def process_table(table_rows):
results["行政处罚决定"] = clean_text(str(table_rows[7].find_all('td')[1]))
results["作出处罚决定的机关名称"] = clean_text(str(table_rows[8].find_all('td')[1]))
results["作出处罚决定的日期"] = clean_text(str(table_rows[9].find_all('td')[1]))
# TODO: keep the keys fixed, fill the values dynamically
else:
temp_dict = {}
for row in table_rows:
columns = row.find_all('td')
if len(columns) >= 2:
header = columns[0].get_text(strip=True)
if "违法违规" in header:
header = "主要违法违规事实"
if "机关名称" in header:
header = "作出处罚决定的机关名称"
if "日期" in header:
header = "作出处罚决定的日期"
content_html = str(columns[1])
content = clean_text(content_html)
temp_dict[header] = content
results = temp_dict
results["行政处罚决定书文号"]=clean_text(str(table_rows[0].find_all_next('td')[1]))
results["被处罚当事人"] = clean_text(str(table_rows[1].find_all_next('td')[1]))
results["主要违法违规事实"] = clean_text(str(table_rows[2].find_all_next('td')[1]))
results["行政处罚依据"] = clean_text(str(table_rows[3].find_all_next('td')[1]))
results["行政处罚决定"] = clean_text(str(table_rows[4].find_all_next('td')[1]))
results["作出处罚决定的机关名称"] = clean_text(str(table_rows[5].find_all_next('td')[1]))
results["作出处罚决定的日期"] = clean_text(str(table_rows[6].find_all_next('td')[1]))
except Exception as e:
print(f"Error processing table: {e}")
return results
def fetch_data(urls):
options = webdriver.ChromeOptions()
options.add_argument('--headless') # run Chrome in headless mode
@@ -114,14 +102,13 @@ def fetch_data(urls):
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
# Try several different selectors
selectors = [
'.Section0 .MsoNormalTable, .Section0 .MsoTableGrid',
'.Section1 .MsoNormalTable, .Section1 .MsoTableGrid',
'.WordSection1 .MsoNormalTable, .WordSection1 .MsoTableGrid',
'.Section0 table', # look directly for a table inside Section0
'.Section1 table', # look directly for a table inside Section1
'.WordSection1 table' # look directly for a table inside WordSection1
'.Section0 table',
'.Section1 table',
'.WordSection1 table'
]
table = None
for selector in selectors:
@@ -152,15 +139,25 @@ def fetch_data(urls):
return all_data
def random_wait(min_time=1, max_time=3):
time.sleep(random.uniform(min_time, max_time))
def create_empty_excel(filename):
columns = ["行政处罚决定书文号", "被处罚当事人", "主要违法违规事实", "行政处罚依据", "行政处罚决定", "作出处罚决定的机关名称", "作出处罚决定的日期"]
df = pd.DataFrame(columns=columns)
df.to_excel(filename, index=False)
def process_in_batches(urls, batch_size=100):
total_urls = len(urls)
num_batches = (total_urls // batch_size) + (1 if total_urls % batch_size != 0 else 0)
file_index = 1
output_filename = f'output_data{file_index}.xlsx'
rows_in_file = 0
if not os.path.exists(output_filename):
create_empty_excel(output_filename)
for batch_num in range(num_batches):
start_index = batch_num * batch_size
end_index = start_index + batch_size
@@ -170,19 +167,28 @@ def process_in_batches(urls, batch_size=100):
batch_data = fetch_data(batch_urls)
try:
existing_data = pd.read_excel('output_data2.xlsx', sheet_name='Sheet1')
existing_data = pd.read_excel(output_filename, sheet_name='Sheet1')
combined_data = pd.concat([existing_data, batch_data], ignore_index=True)
except FileNotFoundError:
combined_data = batch_data
with pd.ExcelWriter('output_data2.xlsx', engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
with pd.ExcelWriter(output_filename, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
combined_data.to_excel(writer, index=False, sheet_name='Sheet1')
# TODO: create the header fields when the output file is first created
rows_in_file += batch_data.shape[0]
if rows_in_file >= 10000:
file_index += 1
output_filename = f'output_data{file_index}.xlsx'
rows_in_file = 0
if not os.path.exists(output_filename):
create_empty_excel(output_filename)
# Read the URL list
with open('url2.txt', 'r') as file:
with open('url1.txt', 'r') as file:
urls = [line.strip() for line in file if line.strip()]
# Process the URLs in batches and write them to Excel
process_in_batches(urls, batch_size=50)
process_in_batches(urls, batch_size=100)
print("Data has been appended to the existing Excel file.")

Binary file not shown.

requirements.txt Normal file
View File

@@ -0,0 +1,30 @@
attrs==23.2.0
beautifulsoup4==4.12.3
certifi==2024.7.4
cffi==1.16.0
charset-normalizer==3.3.2
et-xmlfile==1.1.0
exceptiongroup==1.2.2
h11==0.14.0
idna==3.7
numpy==1.24.4
openpyxl==3.1.5
outcome==1.3.0.post0
pandas==2.0.3
pycparser==2.22
PySocks==1.7.1
python-dateutil==2.9.0.post0
pytz==2024.1
requests==2.32.3
selenium==4.23.1
six==1.16.0
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.5
trio==0.26.0
trio-websocket==0.11.1
typing_extensions==4.12.2
tzdata==2024.1
urllib3==2.2.2
websocket-client==1.8.0
wsproto==1.2.0
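
The pins above include the third-party packages the two scripts import directly (selenium, beautifulsoup4 for BeautifulSoup, pandas, and openpyxl for the Excel writer), so installing them with pip install -r requirements.txt into a clean environment should be enough to run both scripts.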

View File

@@ -18,7 +18,7 @@ def random_wait(min_time=1, max_time=5):
def create_browser():
options = webdriver.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")
# options.add_argument("--headless") # Uncomment this line to use headless mode
options.add_argument("--headless") # Enable headless mode
options.add_argument(
"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
options.add_experimental_option('excludeSwitches', ['enable-automation'])
@@ -43,30 +43,39 @@ def create_browser():
return driver
# Initialize WebDriver
driver = create_browser()
#"https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4114&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
# Base URL information
#"https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4114&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix):
# Initialize WebDriver
driver = create_browser()
# Set to store unique URLs
unique_urls = set()
# Set to store unique URLs
unique_urls = set()
total_urls_saved = 0 # Total count of URLs saved
urls_in_current_file = 0 # Count of URLs in the current file
# Function to check date
def is_date_valid(date_text):
# Function to check date
def is_date_valid(date_text):
given_date = datetime.strptime(date_text, "%Y-%m-%d")
return given_date >= datetime(2023, 11, 28)
return given_date >= date_limit
# Function to save URLs to file
def save_urls_to_file(urls, file_index):
nonlocal total_urls_saved
nonlocal urls_in_current_file
# Visit the initial page
driver.get(base_page_url)
cur_page = 0
with open(f"{output_path_prefix}{file_index}.txt", 'a') as file:
for url in urls:
file.write(url + '\n')
total_urls_saved += 1
urls_in_current_file += 1
print(f"URLs have been saved to {output_path_prefix}{file_index}.txt")
# Keep processing until a date before June 1, 2023, is found
while True:
# Visit the initial page
driver.get(base_page_url)
cur_page = 0
file_index = 1
# Keep processing until a date before the given date_limit is found
while True:
cur_page += 1
print("Visiting new page:" + str(cur_page))
@@ -103,6 +112,16 @@ while True:
should_continue = False
break
# Save the collected URLs to file once the set reaches the flush threshold, then reset unique_urls
if len(unique_urls) >= 10:
save_urls_to_file(unique_urls, file_index)
unique_urls.clear()
# Once the current file reaches the per-file limit, start a new file
if urls_in_current_file >= 20:
file_index += 1
urls_in_current_file = 0
# Check if loop should continue
if not should_continue:
break
@@ -118,17 +137,19 @@ while True:
print("No more pages or error occurred:", e)
break
# Close the browser
driver.quit()
# Save remaining URLs if any
if unique_urls:
save_urls_to_file(unique_urls, file_index)
# Print all unique URLs and count them
cnt = 0
# Close the browser
driver.quit()
# Open a file to write
with open('url2.txt', 'w') as file:
for url in unique_urls:
cnt += 1
file.write(url + '\n') # Write each URL followed by a newline
print("Total URLs saved:", total_urls_saved)
print("URLs have been saved to urls1.txt")
print("Total URLs found:", cnt)
# Example usage
base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
date_limit = datetime(2024, 7, 24)
output_path_prefix = 'url'
fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix)
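
A small sanity check one might run after both stages finish, assuming the output files follow the output_data{N}.xlsx naming and Sheet1 layout used in main_extraction.py (the column list mirrors create_empty_excel):

import glob
import pandas as pd

EXPECTED_COLUMNS = ["行政处罚决定书文号", "被处罚当事人", "主要违法违规事实", "行政处罚依据",
                    "行政处罚决定", "作出处罚决定的机关名称", "作出处罚决定的日期"]

for path in sorted(glob.glob('output_data*.xlsx')):
    df = pd.read_excel(path, sheet_name='Sheet1')
    missing = [col for col in EXPECTED_COLUMNS if col not in df.columns]
    print(f"{path}: {len(df)} rows, missing columns: {missing if missing else 'none'}")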