This commit is contained in:
zhangsan 2024-08-03 09:31:32 +08:00
parent 6f08351615
commit d6c3fd8366
4 changed files with 87 additions and 73 deletions

View File

@@ -18,14 +18,8 @@ COPY requirements.txt .
RUN pip install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ -r requirements.txt
# Copy the rest of the application code
COPY . /reptile
WORKDIR /reptile
# Set environment variables
ENV BASE_PAGE_URL=${BASE_PAGE_URL}
ENV BASE_URL=${BASE_URL}
ENV DATE_LIMIT=${DATE_LIMIT}
ENV OUTPUT_PATH_PREFIX=${OUTPUT_PATH_PREFIX}
COPY . /reptile2
WORKDIR /reptile2
# Run the Python script
CMD ["python", "scrape.py"]
CMD ["python", "main_extraction.py"]

View File

@@ -11,10 +11,5 @@ services:
build: .
depends_on:
- chrome
environment:
- BASE_PAGE_URL=https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382
- BASE_URL=https://www.cbirc.gov.cn/cn/view/pages/
- DATE_LIMIT=2003-12-24
- OUTPUT_PATH_PREFIX=url
volumes:
- .:/reptile
- .:/reptile2

View File

@@ -52,6 +52,17 @@ def process_table(table_rows):
results["行政处罚决定"] = clean_text(str(table_rows[7].find_all('td')[1]))
results["作出处罚决定的机关名称"] = clean_text(str(table_rows[8].find_all('td')[1]))
results["作出处罚决定的日期"] = clean_text(str(table_rows[9].find_all('td')[1]))
elif len(table_rows) == 8:
results["行政处罚决定书文号"] = clean_text(str(table_rows[0].find_all('td')[1]))
org_name=clean_text(str(table_rows[1].find_all('td')[2]))
name=clean_text(str(table_rows[2].find_all('td')[1]))
results["被处罚当事人"] = f'"单位名称": "{org_name}"\n"主要负责人姓名": "{name}"'
results["主要违法违规事实"] = clean_text(str(table_rows[3].find_all('td')[1]))
results["行政处罚依据"] = clean_text(str(table_rows[4].find_all('td')[1]))
results["行政处罚决定"] = clean_text(str(table_rows[5].find_all('td')[1]))
results["作出处罚决定的机关名称"] = clean_text(str(table_rows[6].find_all('td')[1]))
results["作出处罚决定的日期"] = clean_text(str(table_rows[7].find_all('td')[1]))
else:
results["行政处罚决定书文号"]=clean_text(str(table_rows[0].find_all_next('td')[1]))
results["被处罚当事人"] = clean_text(str(table_rows[1].find_all_next('td')[1]))
@@ -66,7 +77,7 @@ def process_table(table_rows):
return results
def fetch_data(urls):
def create_browser():
options = webdriver.ChromeOptions()
options.add_argument('--headless') # 使用无头模式
options.add_argument('--disable-blink-features=AutomationControlled')
@@ -74,27 +85,41 @@ def fetch_data(urls):
'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(options=options)
driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
'source': '''
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
});
window.navigator.chrome = {
runtime: {}
};
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US', 'en']
});
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5]
});
'''
})
driver = webdriver.Remote(
command_executor='http://chrome:4444/wd/hub',
options=options
)
return driver
def fetch_data(urls):
# options = webdriver.ChromeOptions()
# options.add_argument('--headless') # 使用无头模式
# options.add_argument('--disable-blink-features=AutomationControlled')
# options.add_argument(
# 'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
# options.add_experimental_option('excludeSwitches', ['enable-automation'])
# options.add_experimental_option('useAutomationExtension', False)
# driver = webdriver.Chrome(options=options)
# driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
# 'source': '''
# Object.defineProperty(navigator, 'webdriver', {
# get: () => undefined
# });
# window.navigator.chrome = {
# runtime: {}
# };
# Object.defineProperty(navigator, 'languages', {
# get: () => ['en-US', 'en']
# });
# Object.defineProperty(navigator, 'plugins', {
# get: () => [1, 2, 3, 4, 5]
# });
# '''
# })
driver = create_browser()
all_data = pd.DataFrame()
error_urls = []
for url in urls:
try:
driver.get(url)

View File

@@ -17,33 +17,6 @@ def random_wait(min_time=1, max_time=5):
# Function to create a new browser session with options to avoid detection
# def create_browser():
# options = webdriver.ChromeOptions()
# options.add_argument("--disable-blink-features=AutomationControlled")
# options.add_argument("--headless") # Enable headless mode
# options.add_argument(
# "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
# options.add_experimental_option('excludeSwitches', ['enable-automation'])
# options.add_experimental_option('useAutomationExtension', False)
# driver = webdriver.Chrome(options=options)
# driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
# 'source': '''
# Object.defineProperty(navigator, 'webdriver', {
# get: () => undefined
# });
# window.navigator.chrome = {
# runtime: {}
# };
# Object.defineProperty(navigator, 'languages', {
# get: () => ['en-US', 'en']
# });
# Object.defineProperty(navigator, 'plugins', {
# get: () => [1, 2, 3, 4, 5]
# });
# '''
# })
# return driver
def create_browser():
options = webdriver.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")
@@ -52,12 +25,39 @@ def create_browser():
"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Remote(
command_executor='http://chrome:4444/wd/hub',
options=options
)
driver = webdriver.Chrome(options=options)
driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
'source': '''
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
});
window.navigator.chrome = {
runtime: {}
};
Object.defineProperty(navigator, 'languages', {
get: () => ['en-US', 'en']
});
Object.defineProperty(navigator, 'plugins', {
get: () => [1, 2, 3, 4, 5]
});
'''
})
return driver
# def create_browser():
# options = webdriver.ChromeOptions()
# options.add_argument("--disable-blink-features=AutomationControlled")
# options.add_argument("--headless") # Enable headless mode
# options.add_argument(
# "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
# options.add_experimental_option('excludeSwitches', ['enable-automation'])
# options.add_experimental_option('useAutomationExtension', False)
# driver = webdriver.Remote(
# command_executor='http://chrome:4444/wd/hub',
# options=options
# )
# return driver
def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix):
# Initialize WebDriver
@@ -164,13 +164,13 @@ def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix):
# Example usage
# base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
# base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
# date_limit = datetime(2003, 12, 24)
# output_path_prefix = 'url'
base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4113&itemUrl=ItemListRightList.html&itemName=%E6%80%BB%E5%B1%80%E6%9C%BA%E5%85%B3&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
date_limit = datetime(2006, 4, 26)
output_path_prefix = 'url'
# 从环境变量读取参数
base_page_url = os.getenv('BASE_PAGE_URL')
base_url = os.getenv('BASE_URL')
date_limit = datetime.strptime(os.getenv('DATE_LIMIT'), "%Y-%m-%d")
output_path_prefix = os.getenv('OUTPUT_PATH_PREFIX')
# base_page_url = os.getenv('BASE_PAGE_URL')
# base_url = os.getenv('BASE_URL')
# date_limit = datetime.strptime(os.getenv('DATE_LIMIT'), "%Y-%m-%d")
# output_path_prefix = os.getenv('OUTPUT_PATH_PREFIX')
fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix)