zhangsan 2024-08-02 14:20:40 +08:00
parent bc540c7be8
commit 06937c6de2
6 changed files with 178 additions and 121 deletions

View File

@@ -1,2 +1,2 @@
ww
111
The scrape.py script is responsible for scraping the URLs
main_extraction.py is responsible for scraping the page content
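
A rough sketch of how the two stages are meant to chain together, assuming scrape.py has already written its url1.txt, url2.txt, ... chunks (the 'url' prefix comes from output_path_prefix in scrape.py) and that process_in_batches can be imported from main_extraction.py without triggering its module-level run (the current file does not guard that code with __main__):

import glob
from main_extraction import process_in_batches  # assumption: the module-level run is moved under a __main__ guard

for url_file in sorted(glob.glob('url*.txt')):
    with open(url_file, 'r') as f:
        urls = [line.strip() for line in f if line.strip()]
    process_in_batches(urls, batch_size=100)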

View File

@@ -4,19 +4,17 @@ from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import time
import random
import os
def clean_text(html_content):
soup = BeautifulSoup(html_content, 'html.parser')
paragraphs = soup.find_all('p')
lines = []
for p in paragraphs:
# Take the text of the <p>'s direct <span> children (e.g. <span>(一)</span>), discarding the span tags themselves
line = ''.join([span.get_text(strip=True) for span in p.find_all('span', recursive=False)])
lines.append(line)
return '\n'.join(lines).strip()
def process_table(table_rows):
results = {
"行政处罚决定书文号": "",
@@ -53,30 +51,20 @@ def process_table(table_rows):
results["行政处罚决定"] = clean_text(str(table_rows[7].find_all('td')[1]))
results["作出处罚决定的机关名称"] = clean_text(str(table_rows[8].find_all('td')[1]))
results["作出处罚决定的日期"] = clean_text(str(table_rows[9].find_all('td')[1]))
# TODO: keep the keys fixed, fill the values dynamically
else:
temp_dict = {}
for row in table_rows:
columns = row.find_all('td')
if len(columns) >= 2:
header = columns[0].get_text(strip=True)
if "违法违规" in header:
header = "主要违法违规事实"
if "机关名称" in header:
header = "作出处罚决定的机关名称"
if "日期" in header:
header = "作出处罚决定的日期"
content_html = str(columns[1])
content = clean_text(content_html)
temp_dict[header] = content
results = temp_dict
results["行政处罚决定书文号"]=clean_text(str(table_rows[0].find_all_next('td')[1]))
results["被处罚当事人"] = clean_text(str(table_rows[1].find_all_next('td')[1]))
results["主要违法违规事实"] = clean_text(str(table_rows[2].find_all_next('td')[1]))
results["行政处罚依据"] = clean_text(str(table_rows[3].find_all_next('td')[1]))
results["行政处罚决定"] = clean_text(str(table_rows[4].find_all_next('td')[1]))
results["作出处罚决定的机关名称"] = clean_text(str(table_rows[5].find_all_next('td')[1]))
results["作出处罚决定的日期"] = clean_text(str(table_rows[6].find_all_next('td')[1]))
except Exception as e:
print(f"Error processing table: {e}")
return results
def fetch_data(urls):
options = webdriver.ChromeOptions()
options.add_argument('--headless') # run Chrome in headless mode
@@ -114,14 +102,13 @@ def fetch_data(urls):
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
# Try several different selectors
selectors = [
'.Section0 .MsoNormalTable, .Section0 .MsoTableGrid',
'.Section1 .MsoNormalTable, .Section1 .MsoTableGrid',
'.WordSection1 .MsoNormalTable, .WordSection1 .MsoTableGrid',
'.Section0 table', # look directly for a table inside Section0
'.Section1 table', # look directly for a table inside Section1
'.WordSection1 table' # look directly for a table inside WordSection1
'.Section0 table',
'.Section1 table',
'.WordSection1 table'
]
table = None
for selector in selectors:
@@ -152,15 +139,25 @@ def fetch_data(urls):
return all_data
def random_wait(min_time=1, max_time=3):
time.sleep(random.uniform(min_time, max_time))
def create_empty_excel(filename):
columns = ["行政处罚决定书文号", "被处罚当事人", "主要违法违规事实", "行政处罚依据", "行政处罚决定", "作出处罚决定的机关名称", "作出处罚决定的日期"]
df = pd.DataFrame(columns=columns)
df.to_excel(filename, index=False)
def process_in_batches(urls, batch_size=100):
total_urls = len(urls)
num_batches = (total_urls // batch_size) + (1 if total_urls % batch_size != 0 else 0)
file_index = 1
output_filename = f'output_data{file_index}.xlsx'
rows_in_file = 0
if not os.path.exists(output_filename):
create_empty_excel(output_filename)
for batch_num in range(num_batches):
start_index = batch_num * batch_size
end_index = start_index + batch_size
@@ -170,19 +167,28 @@ def process_in_batches(urls, batch_size=100):
batch_data = fetch_data(batch_urls)
try:
existing_data = pd.read_excel('output_data2.xlsx', sheet_name='Sheet1')
existing_data = pd.read_excel(output_filename, sheet_name='Sheet1')
combined_data = pd.concat([existing_data, batch_data], ignore_index=True)
except FileNotFoundError:
combined_data = batch_data
with pd.ExcelWriter('output_data2.xlsx', engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
with pd.ExcelWriter(output_filename, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
combined_data.to_excel(writer, index=False, sheet_name='Sheet1')
# TODO: create the header fields when the output file is first created
rows_in_file += batch_data.shape[0]
if rows_in_file >= 10000:
file_index += 1
output_filename = f'output_data{file_index}.xlsx'
rows_in_file = 0
if not os.path.exists(output_filename):
create_empty_excel(output_filename)
# Read the URL list
with open('url2.txt', 'r') as file:
with open('url1.txt', 'r') as file:
urls = [line.strip() for line in file if line.strip()]
# Process the URLs in batches and write them to Excel
process_in_batches(urls, batch_size=50)
process_in_batches(urls, batch_size=100)
print("Data has been appended to the existing Excel file.")

Binary file not shown.

requirements.txt Normal file
View File

@@ -0,0 +1,30 @@
attrs==23.2.0
beautifulsoup4==4.12.3
certifi==2024.7.4
cffi==1.16.0
charset-normalizer==3.3.2
et-xmlfile==1.1.0
exceptiongroup==1.2.2
h11==0.14.0
idna==3.7
numpy==1.24.4
openpyxl==3.1.5
outcome==1.3.0.post0
pandas==2.0.3
pycparser==2.22
PySocks==1.7.1
python-dateutil==2.9.0.post0
pytz==2024.1
requests==2.32.3
selenium==4.23.1
six==1.16.0
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.5
trio==0.26.0
trio-websocket==0.11.1
typing_extensions==4.12.2
tzdata==2024.1
urllib3==2.2.2
websocket-client==1.8.0
wsproto==1.2.0
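
The pins above include the third-party packages the two scripts import directly (selenium, beautifulsoup4 for BeautifulSoup, pandas, and openpyxl for the Excel writer), so installing them with pip install -r requirements.txt into a clean environment should be enough to run both scripts.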

View File

@@ -18,7 +18,7 @@ def random_wait(min_time=1, max_time=5):
def create_browser():
options = webdriver.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")
# options.add_argument("--headless") # Uncomment this line to use headless mode
options.add_argument("--headless") # Enable headless mode
options.add_argument(
"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
options.add_experimental_option('excludeSwitches', ['enable-automation'])
@@ -43,30 +43,39 @@ def create_browser():
return driver
# Initialize WebDriver
driver = create_browser()
#"https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4114&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
# Base URL information
#"https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4114&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix):
# Initialize WebDriver
driver = create_browser()
# Set to store unique URLs
unique_urls = set()
# Set to store unique URLs
unique_urls = set()
total_urls_saved = 0 # Total count of URLs saved
urls_in_current_file = 0 # Count of URLs in the current file
# Function to check date
def is_date_valid(date_text):
# Function to check date
def is_date_valid(date_text):
given_date = datetime.strptime(date_text, "%Y-%m-%d")
return given_date >= datetime(2023, 11, 28)
return given_date >= date_limit
# Function to save URLs to file
def save_urls_to_file(urls, file_index):
nonlocal total_urls_saved
nonlocal urls_in_current_file
# Visit the initial page
driver.get(base_page_url)
cur_page = 0
with open(f"{output_path_prefix}{file_index}.txt", 'a') as file:
for url in urls:
file.write(url + '\n')
total_urls_saved += 1
urls_in_current_file += 1
print(f"URLs have been saved to {output_path_prefix}{file_index}.txt")
# Keep processing until a date before June 1, 2023, is found
while True:
# Visit the initial page
driver.get(base_page_url)
cur_page = 0
file_index = 1
# Keep processing until a date before the given date_limit is found
while True:
cur_page += 1
print("Visiting new page:" + str(cur_page))
@@ -103,6 +112,16 @@ while True:
should_continue = False
break
# Save the collected URLs to file once the set reaches the flush threshold, then reset unique_urls
if len(unique_urls) >= 10:
save_urls_to_file(unique_urls, file_index)
unique_urls.clear()
# Once the current file reaches the per-file limit, start a new file
if urls_in_current_file >= 20:
file_index += 1
urls_in_current_file = 0
# Check if loop should continue
if not should_continue:
break
@@ -118,17 +137,19 @@ while True:
print("No more pages or error occurred:", e)
break
# Close the browser
driver.quit()
# Save remaining URLs if any
if unique_urls:
save_urls_to_file(unique_urls, file_index)
# Print all unique URLs and count them
cnt = 0
# Close the browser
driver.quit()
# Open a file to write
with open('url2.txt', 'w') as file:
for url in unique_urls:
cnt += 1
file.write(url + '\n') # Write each URL followed by a newline
print("Total URLs saved:", total_urls_saved)
print("URLs have been saved to urls1.txt")
print("Total URLs found:", cnt)
# Example usage
base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
date_limit = datetime(2024, 7, 24)
output_path_prefix = 'url'
fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix)
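
A small sanity check one might run after both stages finish, assuming the output files follow the output_data{N}.xlsx naming and Sheet1 layout used in main_extraction.py (the column list mirrors create_empty_excel):

import glob
import pandas as pd

EXPECTED_COLUMNS = ["行政处罚决定书文号", "被处罚当事人", "主要违法违规事实", "行政处罚依据",
                    "行政处罚决定", "作出处罚决定的机关名称", "作出处罚决定的日期"]

for path in sorted(glob.glob('output_data*.xlsx')):
    df = pd.read_excel(path, sheet_name='Sheet1')
    missing = [col for col in EXPECTED_COLUMNS if col not in df.columns]
    print(f"{path}: {len(df)} rows, missing columns: {missing if missing else 'none'}")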