This commit is contained in:
zhangsan 2024-08-01 21:39:34 +08:00
parent bda2c87625
commit bc540c7be8
3 changed files with 2 additions and 6470 deletions

View File

@@ -53,7 +53,7 @@ def process_table(table_rows):
results["行政处罚决定"] = clean_text(str(table_rows[7].find_all('td')[1]))
results["作出处罚决定的机关名称"] = clean_text(str(table_rows[8].find_all('td')[1]))
results["作出处罚决定的日期"] = clean_text(str(table_rows[9].find_all('td')[1]))
#TODO:键固定,值动态
else:
temp_dict = {}
for row in table_rows:
@@ -178,7 +178,7 @@ def process_in_batches(urls, batch_size=100):
with pd.ExcelWriter('output_data2.xlsx', engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
combined_data.to_excel(writer, index=False, sheet_name='Sheet1')
#TODO:初始创建表头字段
# 读取URL列表
with open('url2.txt', 'r') as file:
urls = [line.strip() for line in file if line.strip()]

View File

@@ -1,9 +0,0 @@
https://www.cbirc.gov.cn/cn/view/pages/ItemDetail.html?docId=1144537&itemId=4115&generaltype=9
https://www.cbirc.gov.cn/cn/view/pages/ItemDetail.html?docId=1148013&itemId=4115&generaltype=9
https://www.cbirc.gov.cn/cn/view/pages/ItemDetail.html?docId=1138922&itemId=4115&generaltype=9
https://www.cbirc.gov.cn/cn/view/pages/ItemDetail.html?docId=1138462&itemId=4115&generaltype=9

6459
urls1.txt

File diff suppressed because it is too large Load Diff