diff --git a/Dockerfile b/Dockerfile index f78330d..97a0073 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,14 +18,8 @@ COPY requirements.txt . RUN pip install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ -r requirements.txt # Copy the rest of the application code -COPY . /reptile -WORKDIR /reptile - -# Set environment variables -ENV BASE_PAGE_URL=${BASE_PAGE_URL} -ENV BASE_URL=${BASE_URL} -ENV DATE_LIMIT=${DATE_LIMIT} -ENV OUTPUT_PATH_PREFIX=${OUTPUT_PATH_PREFIX} +COPY . /reptile2 +WORKDIR /reptile2 # Run the Python script -CMD ["python", "scrape.py"] +CMD ["python", "main_extraction.py"] diff --git a/docker-compose.yaml b/docker-compose.yaml index c28b02f..f50a6fd 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -11,10 +11,5 @@ services: build: . depends_on: - chrome - environment: - - BASE_PAGE_URL=https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382 - - BASE_URL=https://www.cbirc.gov.cn/cn/view/pages/ - - DATE_LIMIT=2003-12-24 - - OUTPUT_PATH_PREFIX=url volumes: - - .:/reptile + - .:/reptile2 diff --git a/main_extraction.py b/main_extraction.py index e338b91..faf4b9c 100644 --- a/main_extraction.py +++ b/main_extraction.py @@ -52,6 +52,17 @@ def process_table(table_rows): results["行政处罚决定"] = clean_text(str(table_rows[7].find_all('td')[1])) results["作出处罚决定的机关名称"] = clean_text(str(table_rows[8].find_all('td')[1])) results["作出处罚决定的日期"] = clean_text(str(table_rows[9].find_all('td')[1])) + elif len(table_rows) == 8: + results["行政处罚决定书文号"] = clean_text(str(table_rows[0].find_all('td')[1])) + org_name=clean_text(str(table_rows[1].find_all('td')[2])) + name=clean_text(str(table_rows[2].find_all('td')[1])) + results["被处罚当事人"] = f'"单位名称": "{org_name}"\n"主要负责人姓名": "{name}"' + results["主要违法违规事实"] = clean_text(str(table_rows[3].find_all('td')[1])) + results["行政处罚依据"] = clean_text(str(table_rows[4].find_all('td')[1])) + results["行政处罚决定"] = clean_text(str(table_rows[5].find_all('td')[1])) + results["作出处罚决定的机关名称"] = clean_text(str(table_rows[6].find_all('td')[1])) + results["作出处罚决定的日期"] = clean_text(str(table_rows[7].find_all('td')[1])) + else: results["行政处罚决定书文号"]=clean_text(str(table_rows[0].find_all_next('td')[1])) results["被处罚当事人"] = clean_text(str(table_rows[1].find_all_next('td')[1])) @@ -66,7 +77,7 @@ def process_table(table_rows): return results -def fetch_data(urls): +def create_browser(): options = webdriver.ChromeOptions() options.add_argument('--headless') # 使用无头模式 options.add_argument('--disable-blink-features=AutomationControlled') @@ -74,27 +85,41 @@ def fetch_data(urls): 'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36') options.add_experimental_option('excludeSwitches', ['enable-automation']) options.add_experimental_option('useAutomationExtension', False) - driver = webdriver.Chrome(options=options) - driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', { - 'source': ''' - Object.defineProperty(navigator, 'webdriver', { - get: () => undefined - }); - window.navigator.chrome = { - runtime: {} - }; - Object.defineProperty(navigator, 'languages', { - get: () => ['en-US', 'en'] - }); - Object.defineProperty(navigator, 'plugins', { - get: () => [1, 2, 3, 4, 5] - }); - ''' - }) + driver = webdriver.Remote( + command_executor='http://chrome:4444/wd/hub', + options=options + ) + return driver + +def fetch_data(urls): + # options = webdriver.ChromeOptions() + # options.add_argument('--headless') # 使用无头模式 + # options.add_argument('--disable-blink-features=AutomationControlled') + # options.add_argument( + # 'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36') + # options.add_experimental_option('excludeSwitches', ['enable-automation']) + # options.add_experimental_option('useAutomationExtension', False) + # driver = webdriver.Chrome(options=options) + # driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', { + # 'source': ''' + # Object.defineProperty(navigator, 'webdriver', { + # get: () => undefined + # }); + # window.navigator.chrome = { + # runtime: {} + # }; + # Object.defineProperty(navigator, 'languages', { + # get: () => ['en-US', 'en'] + # }); + # Object.defineProperty(navigator, 'plugins', { + # get: () => [1, 2, 3, 4, 5] + # }); + # ''' + # }) + driver = create_browser() all_data = pd.DataFrame() error_urls = [] - for url in urls: try: driver.get(url) diff --git a/scrape.py b/scrape.py index ab65088..647bcb0 100644 --- a/scrape.py +++ b/scrape.py @@ -17,33 +17,6 @@ def random_wait(min_time=1, max_time=5): # Function to create a new browser session with options to avoid detection -# def create_browser(): -# options = webdriver.ChromeOptions() -# options.add_argument("--disable-blink-features=AutomationControlled") -# options.add_argument("--headless") # Enable headless mode -# options.add_argument( -# "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") -# options.add_experimental_option('excludeSwitches', ['enable-automation']) -# options.add_experimental_option('useAutomationExtension', False) -# driver = webdriver.Chrome(options=options) -# driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', { -# 'source': ''' -# Object.defineProperty(navigator, 'webdriver', { -# get: () => undefined -# }); -# window.navigator.chrome = { -# runtime: {} -# }; -# Object.defineProperty(navigator, 'languages', { -# get: () => ['en-US', 'en'] -# }); -# Object.defineProperty(navigator, 'plugins', { -# get: () => [1, 2, 3, 4, 5] -# }); -# ''' -# }) -# return driver - def create_browser(): options = webdriver.ChromeOptions() options.add_argument("--disable-blink-features=AutomationControlled") @@ -52,12 +25,39 @@ def create_browser(): "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") options.add_experimental_option('excludeSwitches', ['enable-automation']) options.add_experimental_option('useAutomationExtension', False) - driver = webdriver.Remote( - command_executor='http://chrome:4444/wd/hub', - options=options - ) + driver = webdriver.Chrome(options=options) + driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', { + 'source': ''' + Object.defineProperty(navigator, 'webdriver', { + get: () => undefined + }); + window.navigator.chrome = { + runtime: {} + }; + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US', 'en'] + }); + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5] + }); + ''' + }) return driver +# def create_browser(): +# options = webdriver.ChromeOptions() +# options.add_argument("--disable-blink-features=AutomationControlled") +# options.add_argument("--headless") # Enable headless mode +# options.add_argument( +# "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") +# options.add_experimental_option('excludeSwitches', ['enable-automation']) +# options.add_experimental_option('useAutomationExtension', False) +# driver = webdriver.Remote( +# command_executor='http://chrome:4444/wd/hub', +# options=options +# ) +# return driver + def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix): # Initialize WebDriver @@ -164,13 +164,13 @@ def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix): # Example usage -# base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382" -# base_url = 'https://www.cbirc.gov.cn/cn/view/pages/' -# date_limit = datetime(2003, 12, 24) -# output_path_prefix = 'url' +base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4113&itemUrl=ItemListRightList.html&itemName=%E6%80%BB%E5%B1%80%E6%9C%BA%E5%85%B3&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A" +base_url = 'https://www.cbirc.gov.cn/cn/view/pages/' +date_limit = datetime(2006, 4, 26) +output_path_prefix = 'url' # 从环境变量读取参数 -base_page_url = os.getenv('BASE_PAGE_URL') -base_url = os.getenv('BASE_URL') -date_limit = datetime.strptime(os.getenv('DATE_LIMIT'), "%Y-%m-%d") -output_path_prefix = os.getenv('OUTPUT_PATH_PREFIX') +# base_page_url = os.getenv('BASE_PAGE_URL') +# base_url = os.getenv('BASE_URL') +# date_limit = datetime.strptime(os.getenv('DATE_LIMIT'), "%Y-%m-%d") +# output_path_prefix = os.getenv('OUTPUT_PATH_PREFIX') fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix)