diff --git a/README.md b/README.md
index 4f8132d..4da57bf 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,2 @@
-ww
-111
+scrape.py is the script responsible for collecting the URLs
+main_extraction.py is responsible for extracting the page content
diff --git a/urls.txt b/error_urls.txt
similarity index 100%
rename from urls.txt
rename to error_urls.txt
diff --git a/main_extraction.py b/main_extraction.py
index 3184155..cefca38 100644
--- a/main_extraction.py
+++ b/main_extraction.py
@@ -4,19 +4,17 @@ from selenium import webdriver
 from selenium.webdriver.common.action_chains import ActionChains
 import time
 import random
-
+import os
 
 def clean_text(html_content):
     soup = BeautifulSoup(html_content, 'html.parser')
     paragraphs = soup.find_all('p')
     lines = []
     for p in paragraphs:
-        # If the span has child elements, such as (一), ignore the span tag
         line = ''.join([span.get_text(strip=True) for span in p.find_all('span', recursive=False)])
         lines.append(line)
     return '\n'.join(lines).strip()
-
 
 def process_table(table_rows):
     results = {
         "行政处罚决定书文号": "",
@@ -53,30 +51,20 @@ def process_table(table_rows):
             results["行政处罚决定"] = clean_text(str(table_rows[7].find_all('td')[1]))
             results["作出处罚决定的机关名称"] = clean_text(str(table_rows[8].find_all('td')[1]))
             results["作出处罚决定的日期"] = clean_text(str(table_rows[9].find_all('td')[1]))
-            # TODO: keys are fixed, values are dynamic
         else:
-            temp_dict = {}
-            for row in table_rows:
-                columns = row.find_all('td')
-                if len(columns) >= 2:
-                    header = columns[0].get_text(strip=True)
-                    if "违法违规" in header:
-                        header = "主要违法违规事实"
-                    if "机关名称" in header:
-                        header = "作出处罚决定的机关名称"
-                    if "日期" in header:
-                        header = "作出处罚决定的日期"
-                    content_html = str(columns[1])
-                    content = clean_text(content_html)
-                    temp_dict[header] = content
-            results = temp_dict
+            results["行政处罚决定书文号"] = clean_text(str(table_rows[0].find_all_next('td')[1]))
+            results["被处罚当事人"] = clean_text(str(table_rows[1].find_all_next('td')[1]))
+            results["主要违法违规事实"] = clean_text(str(table_rows[2].find_all_next('td')[1]))
+            results["行政处罚依据"] = clean_text(str(table_rows[3].find_all_next('td')[1]))
+            results["行政处罚决定"] = clean_text(str(table_rows[4].find_all_next('td')[1]))
+            results["作出处罚决定的机关名称"] = clean_text(str(table_rows[5].find_all_next('td')[1]))
+            results["作出处罚决定的日期"] = clean_text(str(table_rows[6].find_all_next('td')[1]))
     except Exception as e:
         print(f"Error processing table: {e}")
 
     return results
 
 
-
 def fetch_data(urls):
     options = webdriver.ChromeOptions()
     options.add_argument('--headless')  # Use headless mode
@@ -114,14 +102,13 @@ def fetch_data(urls):
         html = driver.page_source
         soup = BeautifulSoup(html, 'html.parser')
 
-        # Try different selectors
         selectors = [
             '.Section0 .MsoNormalTable, .Section0 .MsoTableGrid',
             '.Section1 .MsoNormalTable, .Section1 .MsoTableGrid',
             '.WordSection1 .MsoNormalTable, .WordSection1 .MsoTableGrid',
-            '.Section0 table',  # Look for tables directly inside Section0
-            '.Section1 table',  # Look for tables directly inside Section1
-            '.WordSection1 table'  # Look for tables directly inside WordSection1
+            '.Section0 table',
+            '.Section1 table',
+            '.WordSection1 table'
         ]
         table = None
         for selector in selectors:
@@ -152,15 +139,25 @@ def fetch_data(urls):
     return all_data
 
 
-
 def random_wait(min_time=1, max_time=3):
     time.sleep(random.uniform(min_time, max_time))
 
 
+def create_empty_excel(filename):
+    columns = ["行政处罚决定书文号", "被处罚当事人", "主要违法违规事实", "行政处罚依据", "行政处罚决定", "作出处罚决定的机关名称", "作出处罚决定的日期"]
+    df = pd.DataFrame(columns=columns)
+    df.to_excel(filename, index=False)
 
 def process_in_batches(urls, batch_size=100):
     total_urls = len(urls)
     num_batches = (total_urls // batch_size) + (1 if total_urls % batch_size != 0 else 0)
+    file_index = 1
+    output_filename = f'output_data{file_index}.xlsx'
+    rows_in_file = 0
+
+    if not os.path.exists(output_filename):
+        create_empty_excel(output_filename)
+
     for batch_num in range(num_batches):
         start_index = batch_num * batch_size
         end_index = start_index + batch_size
@@ -170,19 +167,28 @@ def process_in_batches(urls, batch_size=100):
         batch_data = fetch_data(batch_urls)
 
         try:
-            existing_data = pd.read_excel('output_data2.xlsx', sheet_name='Sheet1')
+            existing_data = pd.read_excel(output_filename, sheet_name='Sheet1')
             combined_data = pd.concat([existing_data, batch_data], ignore_index=True)
         except FileNotFoundError:
             combined_data = batch_data
 
-        with pd.ExcelWriter('output_data2.xlsx', engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
+        with pd.ExcelWriter(output_filename, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
             combined_data.to_excel(writer, index=False, sheet_name='Sheet1')
 
-# TODO: create the header fields when the file is first created
+        rows_in_file += batch_data.shape[0]
+
+        if rows_in_file >= 10000:
+            file_index += 1
+            output_filename = f'output_data{file_index}.xlsx'
+            rows_in_file = 0
+
+            if not os.path.exists(output_filename):
+                create_empty_excel(output_filename)
+
 # Read the list of URLs
-with open('url2.txt', 'r') as file:
+with open('url1.txt', 'r') as file:
     urls = [line.strip() for line in file if line.strip()]
 
 # Process the URLs in batches and write them to Excel
-process_in_batches(urls, batch_size=50)
+process_in_batches(urls, batch_size=100)
 print("Data has been appended to the existing Excel file.")
diff --git a/output_data.xlsx b/output_data.xlsx
deleted file mode 100644
index 520cdfb..0000000
Binary files a/output_data.xlsx and /dev/null differ
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..7497ab6
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,30 @@
+attrs==23.2.0
+beautifulsoup4==4.12.3
+certifi==2024.7.4
+cffi==1.16.0
+charset-normalizer==3.3.2
+et-xmlfile==1.1.0
+exceptiongroup==1.2.2
+h11==0.14.0
+idna==3.7
+numpy==1.24.4
+openpyxl==3.1.5
+outcome==1.3.0.post0
+pandas==2.0.3
+pycparser==2.22
+PySocks==1.7.1
+python-dateutil==2.9.0.post0
+pytz==2024.1
+requests==2.32.3
+selenium==4.23.1
+six==1.16.0
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.5
+trio==0.26.0
+trio-websocket==0.11.1
+typing_extensions==4.12.2
+tzdata==2024.1
+urllib3==2.2.2
+websocket-client==1.8.0
+wsproto==1.2.0
diff --git a/scrape.py b/scrape.py
index 696ecf4..40456fa 100644
--- a/scrape.py
+++ b/scrape.py
@@ -18,7 +18,7 @@ def random_wait(min_time=1, max_time=5):
 def create_browser():
     options = webdriver.ChromeOptions()
     options.add_argument("--disable-blink-features=AutomationControlled")
-    # options.add_argument("--headless")  # Uncomment this line to use headless mode
+    options.add_argument("--headless")  # Enable headless mode
     options.add_argument(
         "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
     options.add_experimental_option('excludeSwitches', ['enable-automation'])
@@ -43,92 +43,113 @@ def create_browser():
     return driver
 
 
-# Initialize WebDriver
-driver = create_browser()
-#"https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4114&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
-# Base URL information
-#"https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
-base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4114&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
"https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4114&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A" +def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix): + # Initialize WebDriver + driver = create_browser() + + # Set to store unique URLs + unique_urls = set() + total_urls_saved = 0 # Total count of URLs saved + urls_in_current_file = 0 # Count of URLs in the current file + + # Function to check date + def is_date_valid(date_text): + given_date = datetime.strptime(date_text, "%Y-%m-%d") + return given_date >= date_limit + + # Function to save URLs to file + def save_urls_to_file(urls, file_index): + nonlocal total_urls_saved + nonlocal urls_in_current_file + + with open(f"{output_path_prefix}{file_index}.txt", 'a') as file: + for url in urls: + file.write(url + '\n') + total_urls_saved += 1 + urls_in_current_file += 1 + print(f"URLs have been saved to {output_path_prefix}{file_index}.txt") + + # Visit the initial page + driver.get(base_page_url) + cur_page = 0 + file_index = 1 + + # Keep processing until a date before the given date_limit is found + while True: + cur_page += 1 + print("Visiting new page:" + str(cur_page)) + + # Wait for JavaScript to load + random_wait() + + # Get the page source after JS execution + html = driver.page_source + + # Parse the HTML using BeautifulSoup + soup = BeautifulSoup(html, 'html.parser') + + # Find all