second
This commit is contained in:
parent
b41b0ed027
commit
bda2c87625
@ -170,17 +170,17 @@ def process_in_batches(urls, batch_size=100):
|
|||||||
batch_data = fetch_data(batch_urls)
|
batch_data = fetch_data(batch_urls)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
existing_data = pd.read_excel('output_data.xlsx', sheet_name='Sheet1')
|
existing_data = pd.read_excel('output_data2.xlsx', sheet_name='Sheet1')
|
||||||
combined_data = pd.concat([existing_data, batch_data], ignore_index=True)
|
combined_data = pd.concat([existing_data, batch_data], ignore_index=True)
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
combined_data = batch_data
|
combined_data = batch_data
|
||||||
|
|
||||||
with pd.ExcelWriter('output_data.xlsx', engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
|
with pd.ExcelWriter('output_data2.xlsx', engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
|
||||||
combined_data.to_excel(writer, index=False, sheet_name='Sheet1')
|
combined_data.to_excel(writer, index=False, sheet_name='Sheet1')
|
||||||
|
|
||||||
|
|
||||||
# 读取URL列表
|
# 读取URL列表
|
||||||
with open('urls.txt', 'r') as file:
|
with open('url2.txt', 'r') as file:
|
||||||
urls = [line.strip() for line in file if line.strip()]
|
urls = [line.strip() for line in file if line.strip()]
|
||||||
|
|
||||||
# 分批处理URL并写入Excel
|
# 分批处理URL并写入Excel
|
||||||
|
BIN
output_data.xlsx
BIN
output_data.xlsx
Binary file not shown.
@ -45,9 +45,10 @@ def create_browser():
|
|||||||
|
|
||||||
# Initialize WebDriver
|
# Initialize WebDriver
|
||||||
driver = create_browser()
|
driver = create_browser()
|
||||||
|
#"https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4114&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
|
||||||
# Base URL information
|
# Base URL information
|
||||||
base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
|
#"https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
|
||||||
|
base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4114&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
|
||||||
base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
|
base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
|
||||||
|
|
||||||
# Set to store unique URLs
|
# Set to store unique URLs
|
||||||
@ -57,7 +58,7 @@ unique_urls = set()
|
|||||||
# Function to check date
|
# Function to check date
|
||||||
def is_date_valid(date_text):
|
def is_date_valid(date_text):
|
||||||
given_date = datetime.strptime(date_text, "%Y-%m-%d")
|
given_date = datetime.strptime(date_text, "%Y-%m-%d")
|
||||||
return given_date >= datetime(2023, 6, 1)
|
return given_date >= datetime(2023, 11, 28)
|
||||||
|
|
||||||
|
|
||||||
# Visit the initial page
|
# Visit the initial page
|
||||||
@ -124,7 +125,7 @@ driver.quit()
|
|||||||
cnt = 0
|
cnt = 0
|
||||||
|
|
||||||
# Open a file to write
|
# Open a file to write
|
||||||
with open('urls.txt', 'w') as file:
|
with open('url2.txt', 'w') as file:
|
||||||
for url in unique_urls:
|
for url in unique_urls:
|
||||||
cnt += 1
|
cnt += 1
|
||||||
file.write(url + '\n') # Write each URL followed by a newline
|
file.write(url + '\n') # Write each URL followed by a newline
|
||||||
|
@ -1 +0,0 @@
|
|||||||
https://www.cbirc.gov.cn/cn/view/pages/ItemDetail.html?docId=1116560&itemId=4115&generaltype=9
|
|
Loading…
x
Reference in New Issue
Block a user