2024.8.2
parent bc540c7be8
commit 06937c6de2
@@ -4,19 +4,17 @@ from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import time
import random

import os

def clean_text(html_content):
soup = BeautifulSoup(html_content, 'html.parser')
paragraphs = soup.find_all('p')
lines = []
for p in paragraphs:
# If a span has child elements, e.g. <span>(一)</span>, ignore the nested span tag
line = ''.join([span.get_text(strip=True) for span in p.find_all('span', recursive=False)])
lines.append(line)
return '\n'.join(lines).strip()

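# Minimal usage sketch for clean_text (assumed sample input, values are placeholders):
#
#   sample_cell = '<td><p><span>(一)</span><span>未按规定报送报表</span></p><p><span>罚款50万元</span></p></td>'
#   clean_text(sample_cell)   # -> '(一)未按规定报送报表\n罚款50万元'
#
# find_all('span', recursive=False) joins only the spans that sit directly under each <p>,
# so each paragraph of a Word-exported cell collapses to one line, joined with newlines.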
def process_table(table_rows):
results = {
"行政处罚决定书文号": "",
@@ -53,30 +51,20 @@ def process_table(table_rows):
results["行政处罚决定"] = clean_text(str(table_rows[7].find_all('td')[1]))
results["作出处罚决定的机关名称"] = clean_text(str(table_rows[8].find_all('td')[1]))
results["作出处罚决定的日期"] = clean_text(str(table_rows[9].find_all('td')[1]))
# TODO: keep the keys fixed, fill the values dynamically
else:
temp_dict = {}
for row in table_rows:
columns = row.find_all('td')
if len(columns) >= 2:
header = columns[0].get_text(strip=True)
if "违法违规" in header:
header = "主要违法违规事实"
if "机关名称" in header:
header = "作出处罚决定的机关名称"
if "日期" in header:
header = "作出处罚决定的日期"
content_html = str(columns[1])
content = clean_text(content_html)
temp_dict[header] = content
results = temp_dict
results["行政处罚决定书文号"]=clean_text(str(table_rows[0].find_all_next('td')[1]))
results["被处罚当事人"] = clean_text(str(table_rows[1].find_all_next('td')[1]))
results["主要违法违规事实"] = clean_text(str(table_rows[2].find_all_next('td')[1]))
results["行政处罚依据"] = clean_text(str(table_rows[3].find_all_next('td')[1]))
results["行政处罚决定"] = clean_text(str(table_rows[4].find_all_next('td')[1]))
results["作出处罚决定的机关名称"] = clean_text(str(table_rows[5].find_all_next('td')[1]))
results["作出处罚决定的日期"] = clean_text(str(table_rows[6].find_all_next('td')[1]))

except Exception as e:
print(f"Error processing table: {e}")

return results

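# Shape of the returned dict (descriptive note, values shown as placeholders): both the
# fixed-index layout and the header/value layout handled in the else-branch are intended
# to end up with the same seven keys, e.g.
#   {"行政处罚决定书文号": "...", "被处罚当事人": "...", "主要违法违规事实": "...",
#    "行政处罚依据": "...", "行政处罚决定": "...", "作出处罚决定的机关名称": "...",
#    "作出处罚决定的日期": "..."}
# so each result lines up with the columns written by create_empty_excel below.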
def fetch_data(urls):
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # run Chrome in headless mode
@@ -114,14 +102,13 @@ def fetch_data(urls):
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

# Try the selectors one after another
selectors = [
'.Section0 .MsoNormalTable, .Section0 .MsoTableGrid',
'.Section1 .MsoNormalTable, .Section1 .MsoTableGrid',
'.WordSection1 .MsoNormalTable, .WordSection1 .MsoTableGrid',
'.Section0 table',  # look directly for tables inside Section0
'.Section1 table',  # look directly for tables inside Section1
'.WordSection1 table'  # look directly for tables inside WordSection1
'.Section0 table',
'.Section1 table',
'.WordSection1 table'
]
table = None
for selector in selectors:
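# The loop above is expected to stop at the first selector that matches. A minimal sketch
# of that fallback pattern (illustrative only, not the committed loop body) would be:
#   for selector in selectors:
#       table = soup.select_one(selector)
#       if table:
#           break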
@@ -152,15 +139,25 @@ def fetch_data(urls):

return all_data


def random_wait(min_time=1, max_time=3):
time.sleep(random.uniform(min_time, max_time))

def create_empty_excel(filename):
columns = ["行政处罚决定书文号", "被处罚当事人", "主要违法违规事实", "行政处罚依据", "行政处罚决定", "作出处罚决定的机关名称", "作出处罚决定的日期"]
df = pd.DataFrame(columns=columns)
df.to_excel(filename, index=False)

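# Note on ordering (descriptive, based on pandas/openpyxl behaviour): pd.ExcelWriter with
# mode='a' requires the workbook to exist already, so process_in_batches calls
# create_empty_excel() first to lay down a sheet that holds only these seven header columns.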
def process_in_batches(urls, batch_size=100):
total_urls = len(urls)
num_batches = (total_urls // batch_size) + (1 if total_urls % batch_size != 0 else 0)
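# Worked example of the batch count (assumed numbers): 230 URLs with batch_size=100 gives
# 230 // 100 = 2 full batches plus 1 partial batch, so num_batches = 3; an exact multiple
# such as 200 URLs gives 200 // 100 = 2 and no extra batch.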

file_index = 1
output_filename = f'output_data{file_index}.xlsx'
rows_in_file = 0

if not os.path.exists(output_filename):
create_empty_excel(output_filename)

for batch_num in range(num_batches):
start_index = batch_num * batch_size
end_index = start_index + batch_size
@@ -170,19 +167,28 @@ def process_in_batches(urls, batch_size=100):
batch_data = fetch_data(batch_urls)

try:
existing_data = pd.read_excel('output_data2.xlsx', sheet_name='Sheet1')
existing_data = pd.read_excel(output_filename, sheet_name='Sheet1')
combined_data = pd.concat([existing_data, batch_data], ignore_index=True)
except FileNotFoundError:
combined_data = batch_data

with pd.ExcelWriter('output_data2.xlsx', engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
with pd.ExcelWriter(output_filename, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
combined_data.to_excel(writer, index=False, sheet_name='Sheet1')
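# Append pattern used here (descriptive comment): the existing sheet is read back, the new
# batch is concatenated onto it, and the combined frame is rewritten over 'Sheet1' via
# mode='a' with if_sheet_exists='overlay', which writes into the existing sheet in place
# rather than adding a new one.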

# TODO: create the header fields on the first run
rows_in_file += batch_data.shape[0]

if rows_in_file >= 10000:
file_index += 1
output_filename = f'output_data{file_index}.xlsx'
rows_in_file = 0

if not os.path.exists(output_filename):
create_empty_excel(output_filename)

# Read the URL list
with open('url2.txt', 'r') as file:
with open('url1.txt', 'r') as file:
urls = [line.strip() for line in file if line.strip()]

# Process the URLs in batches and write them to Excel
process_in_batches(urls, batch_size=50)
process_in_batches(urls, batch_size=100)
print("Data has been appended to the existing Excel file.")
BIN  output_data.xlsx
Binary file not shown.
30  requirements.txt  Normal file
@@ -0,0 +1,30 @@
attrs==23.2.0
beautifulsoup4==4.12.3
certifi==2024.7.4
cffi==1.16.0
charset-normalizer==3.3.2
et-xmlfile==1.1.0
exceptiongroup==1.2.2
h11==0.14.0
idna==3.7
numpy==1.24.4
openpyxl==3.1.5
outcome==1.3.0.post0
pandas==2.0.3
pycparser==2.22
PySocks==1.7.1
python-dateutil==2.9.0.post0
pytz==2024.1
requests==2.32.3
selenium==4.23.1
six==1.16.0
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.5
trio==0.26.0
trio-websocket==0.11.1
typing_extensions==4.12.2
tzdata==2024.1
urllib3==2.2.2
websocket-client==1.8.0
wsproto==1.2.0
57  scrape.py
@@ -18,7 +18,7 @@ def random_wait(min_time=1, max_time=5):
def create_browser():
options = webdriver.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")
# options.add_argument("--headless") # Uncomment this line to use headless mode
options.add_argument("--headless")  # Enable headless mode
options.add_argument(
"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
options.add_experimental_option('excludeSwitches', ['enable-automation'])
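# Descriptive note on the flags above: "--disable-blink-features=AutomationControlled" keeps
# Chrome from exposing the automation hint that sites commonly probe via navigator.webdriver,
# and excluding the 'enable-automation' switch suppresses the "Chrome is being controlled by
# automated test software" infobar; the fixed user-agent replaces the headless default UA.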
@@ -43,29 +43,38 @@ def create_browser():
return driver


def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix):
# Initialize WebDriver
driver = create_browser()
#"https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4114&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
# Base URL information
#"https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4114&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'

# Set to store unique URLs
unique_urls = set()

total_urls_saved = 0  # Total count of URLs saved
urls_in_current_file = 0  # Count of URLs in the current file

# Function to check date
def is_date_valid(date_text):
given_date = datetime.strptime(date_text, "%Y-%m-%d")
return given_date >= datetime(2023, 11, 28)
return given_date >= date_limit
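# Illustrative check (assumed values): with date_limit = datetime(2024, 7, 24),
#   is_date_valid("2024-08-01")  -> True   (keep collecting)
#   is_date_valid("2024-07-23")  -> False  (older than the cut-off)
# strptime parses the page's YYYY-MM-DD date text before the comparison.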

# Function to save URLs to file
def save_urls_to_file(urls, file_index):
nonlocal total_urls_saved
nonlocal urls_in_current_file

with open(f"{output_path_prefix}{file_index}.txt", 'a') as file:
for url in urls:
file.write(url + '\n')
total_urls_saved += 1
urls_in_current_file += 1
print(f"URLs have been saved to {output_path_prefix}{file_index}.txt")

# Visit the initial page
driver.get(base_page_url)
cur_page = 0
file_index = 1

# Keep processing until a date before June 1, 2023, is found
# Keep processing until a date before the given date_limit is found
while True:
cur_page += 1
print("Visiting new page:" + str(cur_page))
@@ -103,6 +112,16 @@ while True:
should_continue = False
break

# Save URLs once the flush threshold is reached, then reset unique_urls
if len(unique_urls) >= 10:
save_urls_to_file(unique_urls, file_index)
unique_urls.clear()

# If the current file has reached the per-file limit, start a new file
if urls_in_current_file >= 20:
file_index += 1
urls_in_current_file = 0

# Check if loop should continue
if not should_continue:
break
@@ -118,17 +137,19 @@ while True:
print("No more pages or error occurred:", e)
break

# Save remaining URLs if any
if unique_urls:
save_urls_to_file(unique_urls, file_index)

# Close the browser
driver.quit()

# Print all unique URLs and count them
cnt = 0
print("Total URLs saved:", total_urls_saved)

# Open a file to write
with open('url2.txt', 'w') as file:
for url in unique_urls:
cnt += 1
file.write(url + '\n')  # Write each URL followed by a newline

print("URLs have been saved to urls1.txt")
print("Total URLs found:", cnt)
# Example usage
base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
date_limit = datetime(2024, 7, 24)
output_path_prefix = 'url'
fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix)
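# Pipeline sketch (assumed usage, matching the file names seen above): scrape.py walks the
# CBIRC list pages and appends penalty-notice URLs to url1.txt, url2.txt, ... via
# output_path_prefix='url'; the fetch script then reads url1.txt and writes the parsed
# penalty tables to output_data1.xlsx, output_data2.xlsx, ... in batches of 100 URLs.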