8.4改进版
This commit is contained in:
parent
28a57fec78
commit
b6f95d5498
@ -0,0 +1 @@
|
|||||||
|
https://www.cbirc.gov.cn/cn/view/pages/ItemDetail.html?docId=107176&itemId=4114&generaltype=9
|
@ -7,16 +7,35 @@ import random
|
|||||||
import os
|
import os
|
||||||
import glob
|
import glob
|
||||||
|
|
||||||
def clean_text(html_content):
|
# def clean_text(html_content):
|
||||||
soup = BeautifulSoup(html_content, 'html.parser')
|
# soup = BeautifulSoup(html_content, 'html.parser')
|
||||||
paragraphs = soup.find_all('p')
|
# paragraphs = soup.find_all('p')
|
||||||
lines = []
|
# lines = []
|
||||||
for p in paragraphs:
|
# for p in paragraphs:
|
||||||
line = ''.join([span.get_text(strip=True) for span in p.find_all('span', recursive=False)])
|
# line = ''.join([span.get_text(strip=True) for span in p.find_all('span', recursive=False)])
|
||||||
lines.append(line)
|
# lines.append(line)
|
||||||
return '\n'.join(lines).strip()
|
# return '\n'.join(lines).strip()
|
||||||
|
|
||||||
def process_table(table_rows):
|
def clean_text(html_content):
    """Return the text of an HTML fragment with all whitespace removed.

    The fragment is parsed with BeautifulSoup's ``html.parser``. Every
    ``script``, ``style`` and ``o:p`` element is dropped before the text
    is extracted, and every run of whitespace is then deleted from the
    extracted text.

    Args:
        html_content: HTML markup to parse.

    Returns:
        The extracted text as a single string containing no whitespace.
    """
    import re

    document = BeautifulSoup(html_content, 'html.parser')

    # Discard script and style content (and "o:p" elements) so it does not
    # end up in the extracted text.
    for unwanted in document.find_all(["script", "style", "o:p"]):
        unwanted.decompose()

    # Collect the remaining text, then strip out every whitespace run.
    visible = document.get_text(strip=True)
    return re.sub(r'\s+', '', visible)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def process_table(table_rows,current_url, error_urls):
|
||||||
results = {
|
results = {
|
||||||
"行政处罚决定书文号": "",
|
"行政处罚决定书文号": "",
|
||||||
"被处罚当事人": "",
|
"被处罚当事人": "",
|
||||||
@ -24,7 +43,8 @@ def process_table(table_rows):
|
|||||||
"行政处罚依据": "",
|
"行政处罚依据": "",
|
||||||
"行政处罚决定": "",
|
"行政处罚决定": "",
|
||||||
"作出处罚决定的机关名称": "",
|
"作出处罚决定的机关名称": "",
|
||||||
"作出处罚决定的日期": ""
|
"作出处罚决定的日期": "",
|
||||||
|
"网址":current_url # 新增URL列
|
||||||
}
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -53,28 +73,57 @@ def process_table(table_rows):
|
|||||||
results["作出处罚决定的机关名称"] = clean_text(str(table_rows[8].find_all('td')[1]))
|
results["作出处罚决定的机关名称"] = clean_text(str(table_rows[8].find_all('td')[1]))
|
||||||
results["作出处罚决定的日期"] = clean_text(str(table_rows[9].find_all('td')[1]))
|
results["作出处罚决定的日期"] = clean_text(str(table_rows[9].find_all('td')[1]))
|
||||||
elif len(table_rows) == 8:
|
elif len(table_rows) == 8:
|
||||||
|
flag = 0
|
||||||
|
for row in table_rows:
|
||||||
|
cells = row.find_all('td')
|
||||||
|
if len(cells) == 3:
|
||||||
|
flag = 1
|
||||||
results["行政处罚决定书文号"] = clean_text(str(table_rows[0].find_all('td')[1]))
|
results["行政处罚决定书文号"] = clean_text(str(table_rows[0].find_all('td')[1]))
|
||||||
org_name=clean_text(str(table_rows[1].find_all('td')[2]))
|
if(flag):
|
||||||
name=clean_text(str(table_rows[2].find_all('td')[1]))
|
org_name=clean_text(str(table_rows[1].find_all('td')[2]))
|
||||||
results["被处罚当事人"] = f'"单位名称": "{org_name}"\n"主要负责人姓名": "{name}"'
|
name=clean_text(str(table_rows[2].find_all('td')[1]))
|
||||||
|
results["被处罚当事人"] = f'"单位名称": "{org_name}"\n"主要负责人姓名": "{name}"'
|
||||||
|
else:
|
||||||
|
part1 = clean_text(str(table_rows[1].find_all('td')[0])) + ":" + clean_text(
|
||||||
|
str(table_rows[1].find_all('td')[1]))
|
||||||
|
part2 = clean_text(str(table_rows[2].find_all('td')[0])) + ":" + clean_text(
|
||||||
|
str(table_rows[2].find_all('td')[1]))
|
||||||
|
results["被处罚当事人"] = part1 + "\n" + part2 # 使用换行符分隔
|
||||||
results["主要违法违规事实"] = clean_text(str(table_rows[3].find_all('td')[1]))
|
results["主要违法违规事实"] = clean_text(str(table_rows[3].find_all('td')[1]))
|
||||||
results["行政处罚依据"] = clean_text(str(table_rows[4].find_all('td')[1]))
|
results["行政处罚依据"] = clean_text(str(table_rows[4].find_all('td')[1]))
|
||||||
results["行政处罚决定"] = clean_text(str(table_rows[5].find_all('td')[1]))
|
results["行政处罚决定"] = clean_text(str(table_rows[5].find_all('td')[1]))
|
||||||
results["作出处罚决定的机关名称"] = clean_text(str(table_rows[6].find_all('td')[1]))
|
results["作出处罚决定的机关名称"] = clean_text(str(table_rows[6].find_all('td')[1]))
|
||||||
results["作出处罚决定的日期"] = clean_text(str(table_rows[7].find_all('td')[1]))
|
results["作出处罚决定的日期"] = clean_text(str(table_rows[7].find_all('td')[1]))
|
||||||
|
|
||||||
else:
|
elif len(table_rows)==7:
|
||||||
|
flag=0
|
||||||
|
for row in table_rows:
|
||||||
|
cells = row.find_all('td')
|
||||||
|
if len(cells) == 3:
|
||||||
|
# 检查并拼接具有三个td的行的第二和第三个td内容
|
||||||
|
name = clean_text(str(cells[1])) + ":" + clean_text(str(cells[2]))
|
||||||
|
print(name)
|
||||||
|
flag=1
|
||||||
|
if(flag):
|
||||||
|
results["被处罚当事人"] = name
|
||||||
|
else:
|
||||||
|
results["被处罚当事人"] = clean_text(str(table_rows[1].find_all_next('td')[1]))
|
||||||
results["行政处罚决定书文号"]=clean_text(str(table_rows[0].find_all_next('td')[1]))
|
results["行政处罚决定书文号"]=clean_text(str(table_rows[0].find_all_next('td')[1]))
|
||||||
results["被处罚当事人"] = clean_text(str(table_rows[1].find_all_next('td')[1]))
|
|
||||||
results["主要违法违规事实"] = clean_text(str(table_rows[2].find_all_next('td')[1]))
|
results["主要违法违规事实"] = clean_text(str(table_rows[2].find_all_next('td')[1]))
|
||||||
results["行政处罚依据"] = clean_text(str(table_rows[3].find_all_next('td')[1]))
|
results["行政处罚依据"] = clean_text(str(table_rows[3].find_all_next('td')[1]))
|
||||||
results["行政处罚决定"] = clean_text(str(table_rows[4].find_all_next('td')[1]))
|
results["行政处罚决定"] = clean_text(str(table_rows[4].find_all_next('td')[1]))
|
||||||
results["作出处罚决定的机关名称"] = clean_text(str(table_rows[5].find_all_next('td')[1]))
|
results["作出处罚决定的机关名称"] = clean_text(str(table_rows[5].find_all_next('td')[1]))
|
||||||
results["作出处罚决定的日期"] = clean_text(str(table_rows[6].find_all_next('td')[1]))
|
results["作出处罚决定的日期"] = clean_text(str(table_rows[6].find_all_next('td')[1]))
|
||||||
|
else:
|
||||||
|
print(f"Unexpected number of rows in table at URL: {current_url}")
|
||||||
|
error_urls.append(current_url)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error processing table: {e}")
|
print(f"Error processing table: {e}")
|
||||||
|
error_urls.append(current_url)
|
||||||
|
return None
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def create_browser():
|
def create_browser():
|
||||||
@ -144,7 +193,7 @@ def fetch_data(urls):
|
|||||||
|
|
||||||
if table:
|
if table:
|
||||||
table_rows = table.find_all('tr')
|
table_rows = table.find_all('tr')
|
||||||
results = process_table(table_rows)
|
results = process_table(table_rows,url,error_urls)
|
||||||
df = pd.DataFrame([results])
|
df = pd.DataFrame([results])
|
||||||
all_data = pd.concat([all_data, df], ignore_index=True)
|
all_data = pd.concat([all_data, df], ignore_index=True)
|
||||||
else:
|
else:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user