2024.8.2
This commit is contained in:
parent
6f08351615
commit
d6c3fd8366
12
Dockerfile
12
Dockerfile
@ -18,14 +18,8 @@ COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -i https://mirrors.aliyun.com/pypi/simple/ -r requirements.txt
|
||||
|
||||
# Copy the rest of the application code
|
||||
COPY . /reptile
|
||||
WORKDIR /reptile
|
||||
|
||||
# Set environment variables
|
||||
ENV BASE_PAGE_URL=${BASE_PAGE_URL}
|
||||
ENV BASE_URL=${BASE_URL}
|
||||
ENV DATE_LIMIT=${DATE_LIMIT}
|
||||
ENV OUTPUT_PATH_PREFIX=${OUTPUT_PATH_PREFIX}
|
||||
COPY . /reptile2
|
||||
WORKDIR /reptile2
|
||||
|
||||
# Run the Python script
|
||||
CMD ["python", "scrape.py"]
|
||||
CMD ["python", "main_extraction.py"]
|
||||
|
@ -11,10 +11,5 @@ services:
|
||||
build: .
|
||||
depends_on:
|
||||
- chrome
|
||||
environment:
|
||||
- BASE_PAGE_URL=https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%B1%80%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382
|
||||
- BASE_URL=https://www.cbirc.gov.cn/cn/view/pages/
|
||||
- DATE_LIMIT=2003-12-24
|
||||
- OUTPUT_PATH_PREFIX=url
|
||||
volumes:
|
||||
- .:/reptile
|
||||
- .:/reptile2
|
||||
|
@ -52,6 +52,17 @@ def process_table(table_rows):
|
||||
results["行政处罚决定"] = clean_text(str(table_rows[7].find_all('td')[1]))
|
||||
results["作出处罚决定的机关名称"] = clean_text(str(table_rows[8].find_all('td')[1]))
|
||||
results["作出处罚决定的日期"] = clean_text(str(table_rows[9].find_all('td')[1]))
|
||||
elif len(table_rows) == 8:
|
||||
results["行政处罚决定书文号"] = clean_text(str(table_rows[0].find_all('td')[1]))
|
||||
org_name=clean_text(str(table_rows[1].find_all('td')[2]))
|
||||
name=clean_text(str(table_rows[2].find_all('td')[1]))
|
||||
results["被处罚当事人"] = f'"单位名称": "{org_name}"\n"主要负责人姓名": "{name}"'
|
||||
results["主要违法违规事实"] = clean_text(str(table_rows[3].find_all('td')[1]))
|
||||
results["行政处罚依据"] = clean_text(str(table_rows[4].find_all('td')[1]))
|
||||
results["行政处罚决定"] = clean_text(str(table_rows[5].find_all('td')[1]))
|
||||
results["作出处罚决定的机关名称"] = clean_text(str(table_rows[6].find_all('td')[1]))
|
||||
results["作出处罚决定的日期"] = clean_text(str(table_rows[7].find_all('td')[1]))
|
||||
|
||||
else:
|
||||
results["行政处罚决定书文号"]=clean_text(str(table_rows[0].find_all_next('td')[1]))
|
||||
results["被处罚当事人"] = clean_text(str(table_rows[1].find_all_next('td')[1]))
|
||||
@ -66,7 +77,7 @@ def process_table(table_rows):
|
||||
|
||||
return results
|
||||
|
||||
def fetch_data(urls):
|
||||
def create_browser():
|
||||
options = webdriver.ChromeOptions()
|
||||
options.add_argument('--headless') # 使用无头模式
|
||||
options.add_argument('--disable-blink-features=AutomationControlled')
|
||||
@ -74,27 +85,41 @@ def fetch_data(urls):
|
||||
'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
|
||||
options.add_experimental_option('excludeSwitches', ['enable-automation'])
|
||||
options.add_experimental_option('useAutomationExtension', False)
|
||||
driver = webdriver.Chrome(options=options)
|
||||
driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
|
||||
'source': '''
|
||||
Object.defineProperty(navigator, 'webdriver', {
|
||||
get: () => undefined
|
||||
});
|
||||
window.navigator.chrome = {
|
||||
runtime: {}
|
||||
};
|
||||
Object.defineProperty(navigator, 'languages', {
|
||||
get: () => ['en-US', 'en']
|
||||
});
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => [1, 2, 3, 4, 5]
|
||||
});
|
||||
'''
|
||||
})
|
||||
driver = webdriver.Remote(
|
||||
command_executor='http://chrome:4444/wd/hub',
|
||||
options=options
|
||||
)
|
||||
return driver
|
||||
|
||||
def fetch_data(urls):
|
||||
# options = webdriver.ChromeOptions()
|
||||
# options.add_argument('--headless') # 使用无头模式
|
||||
# options.add_argument('--disable-blink-features=AutomationControlled')
|
||||
# options.add_argument(
|
||||
# 'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
|
||||
# options.add_experimental_option('excludeSwitches', ['enable-automation'])
|
||||
# options.add_experimental_option('useAutomationExtension', False)
|
||||
# driver = webdriver.Chrome(options=options)
|
||||
# driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
|
||||
# 'source': '''
|
||||
# Object.defineProperty(navigator, 'webdriver', {
|
||||
# get: () => undefined
|
||||
# });
|
||||
# window.navigator.chrome = {
|
||||
# runtime: {}
|
||||
# };
|
||||
# Object.defineProperty(navigator, 'languages', {
|
||||
# get: () => ['en-US', 'en']
|
||||
# });
|
||||
# Object.defineProperty(navigator, 'plugins', {
|
||||
# get: () => [1, 2, 3, 4, 5]
|
||||
# });
|
||||
# '''
|
||||
# })
|
||||
driver = create_browser()
|
||||
|
||||
all_data = pd.DataFrame()
|
||||
error_urls = []
|
||||
|
||||
for url in urls:
|
||||
try:
|
||||
driver.get(url)
|
||||
|
78
scrape.py
78
scrape.py
@ -17,33 +17,6 @@ def random_wait(min_time=1, max_time=5):
|
||||
|
||||
|
||||
# Function to create a new browser session with options to avoid detection
|
||||
# def create_browser():
|
||||
# options = webdriver.ChromeOptions()
|
||||
# options.add_argument("--disable-blink-features=AutomationControlled")
|
||||
# options.add_argument("--headless") # Enable headless mode
|
||||
# options.add_argument(
|
||||
# "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
|
||||
# options.add_experimental_option('excludeSwitches', ['enable-automation'])
|
||||
# options.add_experimental_option('useAutomationExtension', False)
|
||||
# driver = webdriver.Chrome(options=options)
|
||||
# driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
|
||||
# 'source': '''
|
||||
# Object.defineProperty(navigator, 'webdriver', {
|
||||
# get: () => undefined
|
||||
# });
|
||||
# window.navigator.chrome = {
|
||||
# runtime: {}
|
||||
# };
|
||||
# Object.defineProperty(navigator, 'languages', {
|
||||
# get: () => ['en-US', 'en']
|
||||
# });
|
||||
# Object.defineProperty(navigator, 'plugins', {
|
||||
# get: () => [1, 2, 3, 4, 5]
|
||||
# });
|
||||
# '''
|
||||
# })
|
||||
# return driver
|
||||
|
||||
def create_browser():
|
||||
options = webdriver.ChromeOptions()
|
||||
options.add_argument("--disable-blink-features=AutomationControlled")
|
||||
@ -52,12 +25,39 @@ def create_browser():
|
||||
"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
|
||||
options.add_experimental_option('excludeSwitches', ['enable-automation'])
|
||||
options.add_experimental_option('useAutomationExtension', False)
|
||||
driver = webdriver.Remote(
|
||||
command_executor='http://chrome:4444/wd/hub',
|
||||
options=options
|
||||
)
|
||||
driver = webdriver.Chrome(options=options)
|
||||
driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
|
||||
'source': '''
|
||||
Object.defineProperty(navigator, 'webdriver', {
|
||||
get: () => undefined
|
||||
});
|
||||
window.navigator.chrome = {
|
||||
runtime: {}
|
||||
};
|
||||
Object.defineProperty(navigator, 'languages', {
|
||||
get: () => ['en-US', 'en']
|
||||
});
|
||||
Object.defineProperty(navigator, 'plugins', {
|
||||
get: () => [1, 2, 3, 4, 5]
|
||||
});
|
||||
'''
|
||||
})
|
||||
return driver
|
||||
|
||||
# def create_browser():
|
||||
# options = webdriver.ChromeOptions()
|
||||
# options.add_argument("--disable-blink-features=AutomationControlled")
|
||||
# options.add_argument("--headless") # Enable headless mode
|
||||
# options.add_argument(
|
||||
# "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
|
||||
# options.add_experimental_option('excludeSwitches', ['enable-automation'])
|
||||
# options.add_experimental_option('useAutomationExtension', False)
|
||||
# driver = webdriver.Remote(
|
||||
# command_executor='http://chrome:4444/wd/hub',
|
||||
# options=options
|
||||
# )
|
||||
# return driver
|
||||
|
||||
|
||||
def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix):
|
||||
# Initialize WebDriver
|
||||
@ -164,13 +164,13 @@ def fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix):
|
||||
|
||||
|
||||
# Example usage
|
||||
# base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4115&itemUrl=ItemListRightList.html&itemName=%E7%9B%91%E7%AE%A1%E5%88%86%E5%B1%80%E6%9C%AC%E7%BA%A7&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A#382"
|
||||
# base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
|
||||
# date_limit = datetime(2003, 12, 24)
|
||||
# output_path_prefix = 'url'
|
||||
base_page_url = "https://www.cbirc.gov.cn/cn/view/pages/ItemList.html?itemPId=923&itemId=4113&itemUrl=ItemListRightList.html&itemName=%E6%80%BB%E5%B1%80%E6%9C%BA%E5%85%B3&itemsubPId=931&itemsubPName=%E8%A1%8C%E6%94%BF%E5%A4%84%E7%BD%9A"
|
||||
base_url = 'https://www.cbirc.gov.cn/cn/view/pages/'
|
||||
date_limit = datetime(2006, 4, 26)
|
||||
output_path_prefix = 'url'
|
||||
# 从环境变量读取参数
|
||||
base_page_url = os.getenv('BASE_PAGE_URL')
|
||||
base_url = os.getenv('BASE_URL')
|
||||
date_limit = datetime.strptime(os.getenv('DATE_LIMIT'), "%Y-%m-%d")
|
||||
output_path_prefix = os.getenv('OUTPUT_PATH_PREFIX')
|
||||
# base_page_url = os.getenv('BASE_PAGE_URL')
|
||||
# base_url = os.getenv('BASE_URL')
|
||||
# date_limit = datetime.strptime(os.getenv('DATE_LIMIT'), "%Y-%m-%d")
|
||||
# output_path_prefix = os.getenv('OUTPUT_PATH_PREFIX')
|
||||
fetch_unique_urls(base_page_url, base_url, date_limit, output_path_prefix)
|
||||
|
Loading…
x
Reference in New Issue
Block a user