diff --git a/README.md b/README.md
index 4f8132d..4da57bf 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,2 @@
-ww
-111
+scrape.py is the script responsible for collecting the URLs
+main_extraction.py is responsible for extracting the page content
diff --git a/urls.txt b/error_urls.txt
similarity index 100%
rename from urls.txt
rename to error_urls.txt
diff --git a/main_extraction.py b/main_extraction.py
index 3184155..cefca38 100644
--- a/main_extraction.py
+++ b/main_extraction.py
@@ -4,19 +4,17 @@ from selenium import webdriver
 from selenium.webdriver.common.action_chains import ActionChains
 import time
 import random
-
+import os
 
 def clean_text(html_content):
     soup = BeautifulSoup(html_content, 'html.parser')
     paragraphs = soup.find_all('p')
     lines = []
     for p in paragraphs:
-        # If the span has child elements, such as (一), ignore the span tag
         line = ''.join([span.get_text(strip=True) for span in p.find_all('span', recursive=False)])
         lines.append(line)
     return '\n'.join(lines).strip()
-
 
 def process_table(table_rows):
     results = {
         "行政处罚决定书文号": "",
@@ -53,30 +51,20 @@ def process_table(table_rows):
             results["行政处罚决定"] = clean_text(str(table_rows[7].find_all('td')[1]))
             results["作出处罚决定的机关名称"] = clean_text(str(table_rows[8].find_all('td')[1]))
             results["作出处罚决定的日期"] = clean_text(str(table_rows[9].find_all('td')[1]))
-            # TODO: keys are fixed, values are dynamic
         else:
-            temp_dict = {}
-            for row in table_rows:
-                columns = row.find_all('td')
-                if len(columns) >= 2:
-                    header = columns[0].get_text(strip=True)
-                    if "违法违规" in header:
-                        header = "主要违法违规事实"
-                    if "机关名称" in header:
-                        header = "作出处罚决定的机关名称"
-                    if "日期" in header:
-                        header = "作出处罚决定的日期"
-                    content_html = str(columns[1])
-                    content = clean_text(content_html)
-                    temp_dict[header] = content
-            results = temp_dict
+            results["行政处罚决定书文号"] = clean_text(str(table_rows[0].find_all_next('td')[1]))
+            results["被处罚当事人"] = clean_text(str(table_rows[1].find_all_next('td')[1]))
+            results["主要违法违规事实"] = clean_text(str(table_rows[2].find_all_next('td')[1]))
+            results["行政处罚依据"] = clean_text(str(table_rows[3].find_all_next('td')[1]))
+            results["行政处罚决定"] = clean_text(str(table_rows[4].find_all_next('td')[1]))
+            results["作出处罚决定的机关名称"] = clean_text(str(table_rows[5].find_all_next('td')[1]))
+            results["作出处罚决定的日期"] = clean_text(str(table_rows[6].find_all_next('td')[1]))
     except Exception as e:
         print(f"Error processing table: {e}")
 
     return results
 
 
-
 def fetch_data(urls):
     options = webdriver.ChromeOptions()
     options.add_argument('--headless')  # Use headless mode
@@ -114,14 +102,13 @@ def fetch_data(urls):
         html = driver.page_source
         soup = BeautifulSoup(html, 'html.parser')
 
-        # Try different selectors
         selectors = [
             '.Section0 .MsoNormalTable, .Section0 .MsoTableGrid',
             '.Section1 .MsoNormalTable, .Section1 .MsoTableGrid',
             '.WordSection1 .MsoNormalTable, .WordSection1 .MsoTableGrid',
-            '.Section0 table',  # Look for tables directly inside Section0
-            '.Section1 table',  # Look for tables directly inside Section1
-            '.WordSection1 table'  # Look for tables directly inside WordSection1
+            '.Section0 table',
+            '.Section1 table',
+            '.WordSection1 table'
         ]
         table = None
         for selector in selectors:
@@ -152,15 +139,25 @@ def fetch_data(urls):
     return all_data
 
 
-
 def random_wait(min_time=1, max_time=3):
     time.sleep(random.uniform(min_time, max_time))
 
 
+def create_empty_excel(filename):
+    columns = ["行政处罚决定书文号", "被处罚当事人", "主要违法违规事实", "行政处罚依据", "行政处罚决定", "作出处罚决定的机关名称", "作出处罚决定的日期"]
+    df = pd.DataFrame(columns=columns)
+    df.to_excel(filename, index=False)
 
 def process_in_batches(urls, batch_size=100):
     total_urls = len(urls)
     num_batches = (total_urls // batch_size) + (1 if total_urls % batch_size != 0 else 0)
+    file_index = 1
+    output_filename = f'output_data{file_index}.xlsx'
+    rows_in_file = 0
+
+    if not os.path.exists(output_filename):
+        create_empty_excel(output_filename)
+
     for batch_num in range(num_batches):
         start_index = batch_num * batch_size
         end_index = start_index + batch_size
@@ -170,19 +167,28 @@ def process_in_batches(urls, batch_size=100):
         batch_data = fetch_data(batch_urls)
 
         try:
-            existing_data = pd.read_excel('output_data2.xlsx', sheet_name='Sheet1')
+            existing_data = pd.read_excel(output_filename, sheet_name='Sheet1')
             combined_data = pd.concat([existing_data, batch_data], ignore_index=True)
         except FileNotFoundError:
             combined_data = batch_data
 
-        with pd.ExcelWriter('output_data2.xlsx', engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
+        with pd.ExcelWriter(output_filename, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
             combined_data.to_excel(writer, index=False, sheet_name='Sheet1')
 
-# TODO: create the header fields when the file is first created
+        rows_in_file += batch_data.shape[0]
+
+        if rows_in_file >= 10000:
+            file_index += 1
+            output_filename = f'output_data{file_index}.xlsx'
+            rows_in_file = 0
+
+            if not os.path.exists(output_filename):
+                create_empty_excel(output_filename)
+
 # Read the list of URLs
-with open('url2.txt', 'r') as file:
+with open('url1.txt', 'r') as file:
     urls = [line.strip() for line in file if line.strip()]
 
 # Process the URLs in batches and write them to Excel
-process_in_batches(urls, batch_size=50)
+process_in_batches(urls, batch_size=100)
 print("Data has been appended to the existing Excel file.")
diff --git a/output_data.xlsx b/output_data.xlsx
deleted file mode 100644
index 520cdfb..0000000
Binary files a/output_data.xlsx and /dev/null differ
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..7497ab6
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,30 @@
+attrs==23.2.0
+beautifulsoup4==4.12.3
+certifi==2024.7.4
+cffi==1.16.0
+charset-normalizer==3.3.2
+et-xmlfile==1.1.0
+exceptiongroup==1.2.2
+h11==0.14.0
+idna==3.7
+numpy==1.24.4
+openpyxl==3.1.5
+outcome==1.3.0.post0
+pandas==2.0.3
+pycparser==2.22
+PySocks==1.7.1
+python-dateutil==2.9.0.post0
+pytz==2024.1
+requests==2.32.3
+selenium==4.23.1
+six==1.16.0
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.5
+trio==0.26.0
+trio-websocket==0.11.1
+typing_extensions==4.12.2
+tzdata==2024.1
+urllib3==2.2.2
+websocket-client==1.8.0
+wsproto==1.2.0
diff --git a/scrape.py b/scrape.py
index 696ecf4..40456fa 100644
--- a/scrape.py
+++ b/scrape.py
@@ -18,7 +18,7 @@ def random_wait(min_time=1, max_time=5):
 def create_browser():
     options = webdriver.ChromeOptions()
     options.add_argument("--disable-blink-features=AutomationControlled")
-    # options.add_argument("--headless")  # Uncomment this line to use headless mode
+    options.add_argument("--headless")  # Enable headless mode
     options.add_argument(
         "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
     options.add_experimental_option('excludeSwitches', ['enable-automation'])
@@ -43,92 +43,113 @@ def create_browser():
     return driver
 
 
-# Initialize WebDriver
-driver = create_browser()
-#"https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4114&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
-# Base URL information
-#"https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
-base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4114&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
"https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4114&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A" +def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix): + # Initialize WebDriver + driver = create_browser() + + # Set to store unique URLs + unique_urls = set() + total_urls_saved = 0 # Total count of URLs saved + urls_in_current_file = 0 # Count of URLs in the current file + + # Function to check date + def is_date_valid(date_text): + given_date = datetime.strptime(date_text, "%Y-%m-%d") + return given_date >= date_limit + + # Function to save URLs to file + def save_urls_to_file(urls, file_index): + nonlocal total_urls_saved + nonlocal urls_in_current_file + + with open(f"{output_path_prefix}{file_index}.txt", 'a') as file: + for url in urls: + file.write(url + '\n') + total_urls_saved += 1 + urls_in_current_file += 1 + print(f"URLs have been saved to {output_path_prefix}{file_index}.txt") + + # Visit the initial page + driver.get(base_page_url) + cur_page = 0 + file_index = 1 + + # Keep processing until a date before the given date_limit is found + while True: + cur_page += 1 + print("Visiting new page:" + str(cur_page)) + + # Wait for JavaScript to load + random_wait() + + # Get the page source after JS execution + html = driver.page_source + + # Parse the HTML using BeautifulSoup + soup = BeautifulSoup(html, 'html.parser') + + # Find all