first

2024-07-26 23:44:36 +08:00 · 2024-07-26 23:44:36 +08:00 · f6db74c2a6
commit f6db74c2a6
12 changed files with 7053 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,19 @@
 # 忽略IDE配置文件
 .idea/
 # 忽略Python编译文件
 *.pyc
 __pycache__/
 # 忽略虚拟环境文件夹
 venv/
 # 忽略操作系统生成的文件
 .DS_Store
 Thumbs.db
 # 忽略日志文件
 *.log
 # 忽略临时文件
 *.tmp
--- a/Exported_Data.xlsx
+++ b/Exported_Data.xlsx
--- a/data.html
+++ b/data.html
--- a/extract_table.py
+++ b/extract_table.py
@ -0,0 +1,72 @@
 from selenium import webdriver
 from bs4 import BeautifulSoup
 import pandas as pd
 import time
 def clean_text(html_content):
    """Flatten an HTML fragment into plain text.

    Every text node is stripped of surrounding whitespace and the pieces are
    concatenated with no separator. A text node that is exactly "一、" or
    "二、" is preceded by a newline so each numbered item starts a new line.
    The final string is stripped before it is returned.
    """
    section_markers = ("一、", "二、")
    parsed = BeautifulSoup(html_content, 'html.parser')
    pieces = [
        "\n" + fragment if fragment in section_markers else fragment
        for fragment in parsed.stripped_strings
    ]
    return "".join(pieces).strip()
 def fetch_data(urls):
    """Scrape the ``MsoNormalTable`` table of each URL into one DataFrame.

    Each page is loaded in headless Chrome. For the first table with class
    ``MsoNormalTable``, every row with at least two cells contributes one
    entry: the first cell's text is the column name and the cleaned text of
    the second cell is the value. Pages without such a table are skipped.

    Args:
        urls: Iterable of page URLs to visit.

    Returns:
        A DataFrame with one row per page where a table was found; an empty
        DataFrame when no page yielded a table.
    """
    # Run Chrome in the background
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    driver = webdriver.Chrome(options=options)
    # One single-row frame per page; concatenated once at the end instead of
    # re-copying the accumulated frame on every iteration.
    frames = []
    try:
        for url in urls:
            driver.get(url)
            time.sleep(3)  # give the page's JavaScript time to render the table
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            table = soup.find('table', class_='MsoNormalTable')
            if not table:
                continue
            record = {}
            for row in table.find_all('tr'):
                columns = row.find_all('td')
                # Only rows with both a header cell and a content cell are used
                if len(columns) >= 2:
                    header = columns[0].get_text(strip=True)
                    record[header] = clean_text(str(columns[1]))
            frames.append(pd.DataFrame([record]))
    finally:
        # Close the browser even when a page load or parse step raises
        driver.quit()
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
 # URLs to process
 urls = [
    "https://www.cbirc.gov.cn/cn/view/pages/ItemDetail.html?docId=1171824&itemId=4115&generaltype=9",
    # More URLs can be added here
 ]
 # Scrape every URL and collect the extracted rows
 result_data = fetch_data(urls)
 # Save the result to an Excel file (without the index column)
 result_data.to_excel('output_data.xlsx', index=False)
--- a/extract_table2.py
+++ b/extract_table2.py
@ -0,0 +1,47 @@
 import pandas as pd
 from bs4 import BeautifulSoup
 # Load the saved HTML page from disk
 with open('D:/folder/study/reptile-project/data.html', 'r', encoding='utf-8') as file:
    html_content = file.read()
 # Parse the page and collect every table row
 soup = BeautifulSoup(html_content, 'html.parser')
 table_rows = soup.find_all('tr')
 # Output record; every field defaults to an empty string
 results = dict.fromkeys(
    [
        "行政处罚决定书文号",
        "被处罚当事人",
        "主要违法违规事实",
        "行政处罚依据",
        "行政处罚决定",
        "作出处罚决定的机关名称",
        "作出处罚决定的日期",
    ],
    "",
 )
 def _cell_text(row_index, cell_index):
    """Return the stripped text of the first <p> in the given row and cell."""
    cell = table_rows[row_index].find_all('td')[cell_index]
    return cell.find('p').get_text(strip=True)
 # Fields are read by fixed position, so at least nine rows are required
 if len(table_rows) >= 9:
    results["行政处罚决定书文号"] = _cell_text(0, 1)
    # Individual name, organisation name, legal representative name
    person_name = _cell_text(1, 2)
    org_name = _cell_text(2, 2)
    legal_rep_name = _cell_text(3, 1)
    # The punished party is stored as three quoted "label": "value" lines
    results["被处罚当事人"] = '\n'.join([
        f'"个人姓名": "{person_name}"',
        f'"单位名称": "{org_name}"',
        f'"单位法定代表人（主要负责人）姓名": "{legal_rep_name}"',
    ])
    # Rows 4..8 each hold one remaining field in their second cell
    for row_index, field in enumerate(
        [
            "主要违法违规事实",
            "行政处罚依据",
            "行政处罚决定",
            "作出处罚决定的机关名称",
            "作出处罚决定的日期",
        ],
        start=4,
    ):
        results[field] = _cell_text(row_index, 1)
 # Build a single-row DataFrame and write it to Excel
 df = pd.DataFrame([results])
 df.to_excel('output_data.xlsx', index=False, engine='openpyxl')
--- a/main.py
+++ b/main.py
--- a/main_extraction.py
+++ b/main_extraction.py
@ -0,0 +1,188 @@
 import pandas as pd
 from bs4 import BeautifulSoup
 from selenium import webdriver
 from selenium.webdriver.common.action_chains import ActionChains
 import time
 import random
 def clean_text(html_content):
    """Convert an HTML fragment into newline-separated paragraph text.

    Each <p> element becomes one output line, built by concatenating the
    stripped text of the <span> elements that are direct children of that
    paragraph. Spans nested deeper are not visited separately; their text is
    already part of the enclosing direct-child span's text. The joined result
    is stripped before it is returned.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    return '\n'.join(
        ''.join(
            span.get_text(strip=True)
            for span in paragraph.find_all('span', recursive=False)
        )
        for paragraph in soup.find_all('p')
    ).strip()
 def process_table(table_rows):
    """Extract the penalty-notice fields from the rows of one table.

    Tables with exactly 9 or 10 rows are read by fixed row/cell position.
    Any other row count falls back to a generic scan that uses each row's
    first cell as the key and the cleaned second cell as the value, with a
    few header keywords normalised to canonical field names.

    Args:
        table_rows: Sequence of <tr> elements.

    Returns:
        A dict of field name to text. If an exception occurs it is printed
        and whatever was filled in so far is returned (for the generic scan,
        the all-empty default record).
    """
    field_names = (
        "行政处罚决定书文号",
        "被处罚当事人",
        "主要违法违规事实",
        "行政处罚依据",
        "行政处罚决定",
        "作出处罚决定的机关名称",
        "作出处罚决定的日期",
    )
    results = dict.fromkeys(field_names, "")
    # Row count -> (label, row index, cell index) for each punished-party line
    party_layouts = {
        9: (
            ("个人姓名", 1, 2),
            ("单位名称", 2, 2),
            ("单位法定代表人（主要负责人）姓名", 3, 1),
        ),
        10: (
            ("个人姓名", 1, 3),
            ("个人单位", 2, 1),
            ("单位名称", 3, 2),
            ("单位法定代表人（主要负责人）姓名", 4, 1),
        ),
    }
    # Applied in order to the (possibly already replaced) header text
    header_aliases = (
        ("违法违规", "主要违法违规事实"),
        ("机关名称", "作出处罚决定的机关名称"),
        ("日期", "作出处罚决定的日期"),
    )

    def cell_text(row_index, cell_index):
        # Cleaned text of one <td>, addressed by row and cell position
        return clean_text(str(table_rows[row_index].find_all('td')[cell_index]))

    try:
        layout = party_layouts.get(len(table_rows))
        if layout is not None:
            results[field_names[0]] = cell_text(0, 1)
            party_lines = [
                f'"{label}": "{cell_text(row_index, cell_index)}"'
                for label, row_index, cell_index in layout
            ]
            results[field_names[1]] = '\n'.join(party_lines)
            # The remaining fields follow the party rows, one per row
            first_tail_row = len(layout) + 1
            for offset, field in enumerate(field_names[2:]):
                results[field] = cell_text(first_tail_row + offset, 1)
        else:
            collected = {}
            for row in table_rows:
                columns = row.find_all('td')
                if len(columns) < 2:
                    continue
                header = columns[0].get_text(strip=True)
                for keyword, canonical in header_aliases:
                    if keyword in header:
                        header = canonical
                collected[header] = clean_text(str(columns[1]))
            # Only replace the default record once the whole scan succeeded
            results = collected
    except Exception as e:
        print(f"Error processing table: {e}")
    return results
 def fetch_data(urls):
    """Load each URL in headless Chrome and extract its penalty table.

    A series of CSS selectors is tried in order and the first matching table
    is passed to ``process_table``. URLs with no matching table, or whose
    processing raises, are reported on stdout and appended to
    ``error_urls.txt``.

    Args:
        urls: Iterable of page URLs to visit.

    Returns:
        A DataFrame with one row per URL whose table was found; an empty
        DataFrame when none was.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # run without a visible window
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument(
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_experimental_option('useAutomationExtension', False)
    # Script injected into every new document to override navigator properties
    stealth_script = '''
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            });
            window.navigator.chrome = {
                runtime: {}
            };
            Object.defineProperty(navigator, 'languages', {
                get: () => ['en-US', 'en']
            });
            Object.defineProperty(navigator, 'plugins', {
                get: () => [1, 2, 3, 4, 5]
            });
        '''
    # Candidate selectors, most specific first; the first match wins
    selectors = [
        '.Section0 .MsoNormalTable, .Section0 .MsoTableGrid',
        '.Section1 .MsoNormalTable, .Section1 .MsoTableGrid',
        '.WordSection1 .MsoNormalTable, .WordSection1 .MsoTableGrid',
        '.Section0 table',  # any table inside Section0
        '.Section1 table',  # any table inside Section1
        '.WordSection1 table'  # any table inside WordSection1
    ]
    driver = webdriver.Chrome(options=options)
    # One single-row frame per page; concatenated once at the end
    frames = []
    error_urls = []
    try:
        driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
            'source': stealth_script
        })
        for url in urls:
            try:
                driver.get(url)
                print("Processing URL:", url)
                random_wait(1, 3)  # random pause before reading the page
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                table = None
                for selector in selectors:
                    table = soup.select_one(selector)
                    if table:
                        break
                if table:
                    results = process_table(table.find_all('tr'))
                    frames.append(pd.DataFrame([results]))
                else:
                    print(f"No table found for URL: {url}")
                    error_urls.append(url)
            except Exception as e:
                # Best effort: record the failing URL and move on to the next
                print(f"Error processing URL {url}: {e}")
                error_urls.append(url)
    finally:
        # Close the browser even if setup or an unexpected error aborts the run
        driver.quit()
    if error_urls:
        # Append rather than overwrite: this function runs once per batch, and
        # truncating here would erase the failures recorded by earlier batches.
        with open('error_urls.txt', 'a', encoding='utf-8') as file:
            file.writelines(f"{error_url}\n" for error_url in error_urls)
        print("Error URLs have been saved to error_urls.txt")
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
 def random_wait(min_time=1, max_time=3):
    """Sleep for a random number of seconds drawn uniformly between the two bounds."""
    delay = random.uniform(min_time, max_time)
    time.sleep(delay)
 def process_in_batches(urls, batch_size=100):
    """Scrape ``urls`` in batches, saving to ``output_data.xlsx`` after each.

    After every batch the existing ``Sheet1`` (if the workbook exists) is
    read, the new rows are appended to it, and the combined table is written
    back, so progress is saved between batches.

    Args:
        urls: Sequence of page URLs (must support ``len`` and slicing).
        batch_size: Number of URLs handed to ``fetch_data`` per batch.
    """
    total_urls = len(urls)
    # Ceiling division: a partial final batch still counts as a batch
    num_batches = (total_urls // batch_size) + (1 if total_urls % batch_size != 0 else 0)
    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        batch_urls = urls[start_index:start_index + batch_size]
        print(f"Processing batch {batch_num + 1} of {num_batches}")
        batch_data = fetch_data(batch_urls)
        try:
            existing_data = pd.read_excel('output_data.xlsx', sheet_name='Sheet1')
        except FileNotFoundError:
            # No workbook yet: append mode needs an existing file, so create
            # the workbook in write mode instead.
            combined_data = batch_data
            writer_options = {'mode': 'w'}
        else:
            combined_data = pd.concat([existing_data, batch_data], ignore_index=True)
            # Overlay Sheet1 in place, keeping any other sheets in the workbook
            writer_options = {'mode': 'a', 'if_sheet_exists': 'overlay'}
        with pd.ExcelWriter('output_data.xlsx', engine='openpyxl', **writer_options) as writer:
            combined_data.to_excel(writer, index=False, sheet_name='Sheet1')
 # Read the URL list, one URL per line, skipping blank lines
 with open('urls.txt', 'r') as file:
    urls = [line.strip() for line in file if line.strip()]
 # Process the URLs in batches and write the results to Excel
 process_in_batches(urls, batch_size=50)
 print("Data has been appended to the existing Excel file.")
--- a/output_data.xlsx
+++ b/output_data.xlsx
--- a/scrape.py
+++ b/scrape.py
@ -0,0 +1,133 @@
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.common.action_chains import ActionChains
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 from bs4 import BeautifulSoup
 import time
 from datetime import datetime
 import random
 # Pause for a random interval
 def random_wait(min_time=1, max_time=5):
    """Sleep for a random number of seconds drawn uniformly between the two bounds."""
    pause_seconds = random.uniform(min_time, max_time)
    time.sleep(pause_seconds)
 # Build a Chrome session configured to look less like an automated browser
 def create_browser():
    """Create and return a Chrome WebDriver with anti-detection settings.

    The session disables the ``AutomationControlled`` Blink feature, sends a
    fixed desktop Chrome user-agent, drops the ``enable-automation`` switch
    and the automation extension, and registers a script (run on every new
    document) that overrides several ``navigator`` properties.
    """
    spoofed_user_agent = (
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
    stealth_script = '''
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            });
            window.navigator.chrome = {
                runtime: {}
            };
            Object.defineProperty(navigator, 'languages', {
                get: () => ['en-US', 'en']
            });
            Object.defineProperty(navigator, 'plugins', {
                get: () => [1, 2, 3, 4, 5]
            });
        '''
    chrome_options = webdriver.ChromeOptions()
    # Headless mode is intentionally not enabled here; add "--headless" to
    # this tuple to run without a visible window.
    for argument in ("--disable-blink-features=AutomationControlled", spoofed_user_agent):
        chrome_options.add_argument(argument)
    chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    browser = webdriver.Chrome(options=chrome_options)
    browser.execute_cdp_cmd(
        'Page.addScriptToEvaluateOnNewDocument',
        {'source': stealth_script},
    )
    return browser
 # Start the browser session used for the whole crawl
 driver = create_browser()
 # Listing page to start from, and the prefix prepended to each relative link
 base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
 base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
 # Collected detail-page URLs; a set so duplicates are dropped
 unique_urls = set()
 # Check whether a listing date is on or after the cutoff
 def is_date_valid(date_text, cutoff=datetime(2023, 6, 1)):
    """Return True when ``date_text`` is on or after ``cutoff``.

    Args:
        date_text: Date string in ``YYYY-MM-DD`` format.
        cutoff: Earliest accepted date (inclusive). Defaults to 2023-06-01.

    Raises:
        ValueError: If ``date_text`` does not match ``%Y-%m-%d``.
    """
    given_date = datetime.strptime(date_text, "%Y-%m-%d")
    return given_date >= cutoff
 # Walk the listing pages, collecting detail-page links until an entry is
 # older than the date accepted by is_date_valid
 cur_page = 0
 try:
    # Visit the initial page
    driver.get(base_page_url)
    while True:
        cur_page += 1
        print("Visiting new page:" + str(cur_page))
        # Wait for JavaScript to load
        random_wait()
        # Parse the page source as it stands after JS execution
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # Each listing entry is a <div> with this class
        div_elements = soup.find_all('div', class_="panel-row ng-scope")
        # Becomes True once this page shows at least one in-range entry
        should_continue = False
        for div in div_elements:
            date_span = div.find('span', class_='date ng-binding')
            if not date_span:
                continue
            date_text = date_span.text.strip()
            if not is_date_valid(date_text):
                # The original author notes entries are sorted newest first,
                # so the first out-of-range date ends the crawl.
                should_continue = False
                break
            should_continue = True
            link = div.find('a', href=True, attrs={'ng-bind-html': 'x.docSubtitle|trustHtml'})
            if link and "处罚信息公开表" in link.text:
                full_url = base_url + link['href']
                # Skip results where the href already carried the path prefix,
                # which leaves a doubled "//cn/view/pages/" in the URL
                if "//cn/view/pages/" not in full_url:
                    unique_urls.add(full_url)
        if not should_continue:
            break
        # Try to find and click the next page button
        try:
            next_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//a[@ng-click='pager.next()']"))
            )
            ActionChains(driver).move_to_element(next_button).click().perform()
            random_wait()  # Wait for the next page to load
        except Exception as e:
            print("No more pages or error occurred:", e)
            break
 finally:
    # Close the browser even if the crawl aborts with an exception
    driver.quit()
 # Save the collected URLs, one per line, and report how many were found
 cnt = len(unique_urls)
 with open('urls.txt', 'w') as file:
    for url in unique_urls:
        file.write(url + '\n')  # Write each URL followed by a newline
 # Report the file that was actually written
 print("URLs have been saved to urls.txt")
 print("Total URLs found:", cnt)
--- a/testurl.txt
+++ b/testurl.txt
@ -0,0 +1 @@
 https://www.cbirc.gov.cn/cn/view/pages/ItemDetail.html?docId=1116560&itemId=4115&generaltype=9
--- a/urls.txt
+++ b/urls.txt
--- a/urls1.txt
+++ b/urls1.txt
@ -0,0 +1 @@
 https://www.cbirc.gov.cn/cn/view/pages/ItemDetail.html?docId=1116560&itemId=4115&generaltype=9