second
This commit is contained in:
parent
b41b0ed027
commit
bda2c87625
@ -170,17 +170,17 @@ def process_in_batches(urls, batch_size=100):
|
||||
batch_data = fetch_data(batch_urls)
|
||||
|
||||
try:
|
||||
existing_data = pd.read_excel('output_data.xlsx', sheet_name='Sheet1')
|
||||
existing_data = pd.read_excel('output_data2.xlsx', sheet_name='Sheet1')
|
||||
combined_data = pd.concat([existing_data, batch_data], ignore_index=True)
|
||||
except FileNotFoundError:
|
||||
combined_data = batch_data
|
||||
|
||||
with pd.ExcelWriter('output_data.xlsx', engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
|
||||
with pd.ExcelWriter('output_data2.xlsx', engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
|
||||
combined_data.to_excel(writer, index=False, sheet_name='Sheet1')
|
||||
|
||||
|
||||
# 读取URL列表
|
||||
with open('urls.txt', 'r') as file:
|
||||
with open('url2.txt', 'r') as file:
|
||||
urls = [line.strip() for line in file if line.strip()]
|
||||
|
||||
# 分批处理URL并写入Excel
|
||||
|
BIN
output_data.xlsx
BIN
output_data.xlsx
Binary file not shown.
@ -45,9 +45,10 @@ def create_browser():
|
||||
|
||||
# Initialize WebDriver
|
||||
driver = create_browser()
|
||||
|
||||
#"https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4114&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
|
||||
# Base URL information
|
||||
base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
|
||||
#"https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
|
||||
base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4114&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
|
||||
base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
|
||||
|
||||
# Set to store unique URLs
|
||||
@ -57,7 +58,7 @@ unique_urls = set()
|
||||
# Function to check date
|
||||
def is_date_valid(date_text):
|
||||
given_date = datetime.strptime(date_text, "%Y-%m-%d")
|
||||
return given_date >= datetime(2023, 6, 1)
|
||||
return given_date >= datetime(2023, 11, 28)
|
||||
|
||||
|
||||
# Visit the initial page
|
||||
@ -124,7 +125,7 @@ driver.quit()
|
||||
cnt = 0
|
||||
|
||||
# Open a file to write
|
||||
with open('urls.txt', 'w') as file:
|
||||
with open('url2.txt', 'w') as file:
|
||||
for url in unique_urls:
|
||||
cnt += 1
|
||||
file.write(url + '\n') # Write each URL followed by a newline
|
||||
|
@ -1 +0,0 @@
|
||||
https://www.cbirc.gov.cn/cn/view/pages/ItemDetail.html?docId=1116560&itemId=4115&generaltype=9
|
Loading…
x
Reference in New Issue
Block a user