diff --git a/error_urls.txt b/error_urls.txt
index e69de29..2ee9dc6 100644
--- a/error_urls.txt
+++ b/error_urls.txt
@@ -0,0 +1 @@
+https://www.cbirc.gov.cn/cn/view/pages/ItemDetail.html?docId=107176&itemId=4114&generaltype=9
diff --git a/main_extraction.py b/main_extraction.py
index faf4b9c..281fbb8 100644
--- a/main_extraction.py
+++ b/main_extraction.py
@@ -7,16 +7,35 @@
 import random
 import os
 import glob
-def clean_text(html_content):
-    soup = BeautifulSoup(html_content, 'html.parser')
-    paragraphs = soup.find_all('p')
-    lines = []
-    for p in paragraphs:
-        line = ''.join([span.get_text(strip=True) for span in p.find_all('span', recursive=False)])
-        lines.append(line)
-    return '\n'.join(lines).strip()
+# def clean_text(html_content):
+#     soup = BeautifulSoup(html_content, 'html.parser')
+#     paragraphs = soup.find_all('p')
+#     lines = []
+#     for p in paragraphs:
+#         line = ''.join([span.get_text(strip=True) for span in p.find_all('span', recursive=False)])
+#         lines.append(line)
+#     return '\n'.join(lines).strip()
 
-def process_table(table_rows):
+def clean_text(html_content):
+    import re
+
+    soup = BeautifulSoup(html_content, 'html.parser')
+
+    # Remove script, style, and Word-export <o:p> content
+    for script_or_style in soup(["script", "style", "o:p"]):
+        script_or_style.decompose()
+
+    # Extract all visible text
+    text = soup.get_text(strip=True)
+
+    # Tidy the text: remove all whitespace
+    cleaned_text = re.sub(r'\s+', '', text)  # strip all whitespace
+
+    return cleaned_text
+
+
+
+def process_table(table_rows, current_url, error_urls):
     results = {
         "行政处罚决定书文号": "",
         "被处罚当事人": "",
@@ -24,7 +43,8 @@ def process_table(table_rows):
         "行政处罚依据": "",
         "行政处罚决定": "",
         "作出处罚决定的机关名称": "",
-        "作出处罚决定的日期": ""
+        "作出处罚决定的日期": "",
+        "网址": current_url  # new URL column
     }
 
     try:
@@ -53,28 +73,57 @@
             results["作出处罚决定的机关名称"] = clean_text(str(table_rows[8].find_all('td')[1]))
             results["作出处罚决定的日期"] = clean_text(str(table_rows[9].find_all('td')[1]))
         elif len(table_rows) == 8:
+            flag = 0
+            for row in table_rows:
+                cells = row.find_all('td')
+                if len(cells) == 3:
+                    flag = 1
             results["行政处罚决定书文号"] = clean_text(str(table_rows[0].find_all('td')[1]))
-            org_name=clean_text(str(table_rows[1].find_all('td')[2]))
-            name=clean_text(str(table_rows[2].find_all('td')[1]))
-            results["被处罚当事人"] = f'"单位名称": "{org_name}"\n"主要负责人姓名": "{name}"'
+            if flag:
+                org_name = clean_text(str(table_rows[1].find_all('td')[2]))
+                name = clean_text(str(table_rows[2].find_all('td')[1]))
+                results["被处罚当事人"] = f'"单位名称": "{org_name}"\n"主要负责人姓名": "{name}"'
+            else:
+                part1 = clean_text(str(table_rows[1].find_all('td')[0])) + ":" + clean_text(
+                    str(table_rows[1].find_all('td')[1]))
+                part2 = clean_text(str(table_rows[2].find_all('td')[0])) + ":" + clean_text(
+                    str(table_rows[2].find_all('td')[1]))
+                results["被处罚当事人"] = part1 + "\n" + part2  # separated by a newline
             results["主要违法违规事实"] = clean_text(str(table_rows[3].find_all('td')[1]))
             results["行政处罚依据"] = clean_text(str(table_rows[4].find_all('td')[1]))
             results["行政处罚决定"] = clean_text(str(table_rows[5].find_all('td')[1]))
             results["作出处罚决定的机关名称"] = clean_text(str(table_rows[6].find_all('td')[1]))
             results["作出处罚决定的日期"] = clean_text(str(table_rows[7].find_all('td')[1]))
-        else:
+        elif len(table_rows) == 7:
+            flag = 0
+            for row in table_rows:
+                cells = row.find_all('td')
+                if len(cells) == 3:
+                    # For a row with three <td> cells, join the second and third cell contents
+                    name = clean_text(str(cells[1])) + ":" + clean_text(str(cells[2]))
+                    print(name)
+                    flag = 1
+            if flag:
+                results["被处罚当事人"] = name
+            else:
+                results["被处罚当事人"] = clean_text(str(table_rows[1].find_all_next('td')[1]))
             results["行政处罚决定书文号"]=clean_text(str(table_rows[0].find_all_next('td')[1]))
-            results["被处罚当事人"] = clean_text(str(table_rows[1].find_all_next('td')[1]))
             results["主要违法违规事实"] = clean_text(str(table_rows[2].find_all_next('td')[1]))
             results["行政处罚依据"] = clean_text(str(table_rows[3].find_all_next('td')[1]))
             results["行政处罚决定"] = clean_text(str(table_rows[4].find_all_next('td')[1]))
             results["作出处罚决定的机关名称"] = clean_text(str(table_rows[5].find_all_next('td')[1]))
             results["作出处罚决定的日期"] = clean_text(str(table_rows[6].find_all_next('td')[1]))
+        else:
+            print(f"Unexpected number of rows in table at URL: {current_url}")
+            error_urls.append(current_url)
+            return None
+
     except Exception as e:
         print(f"Error processing table: {e}")
-
+        error_urls.append(current_url)
+        return None
     return results
 
 def create_browser():
@@ -144,7 +193,7 @@ def fetch_data(urls):
             if table:
                 table_rows = table.find_all('tr')
 
-                results = process_table(table_rows)
+                results = process_table(table_rows, url, error_urls)
                 df = pd.DataFrame([results])
                 all_data = pd.concat([all_data, df], ignore_index=True)
             else: