import re
from unittest.mock import Mock

# Heading that opens the body of the notice: "一、说明" / "一.总则" (optional
# whitespace between the characters). MULTILINE lets ``^`` match at the start
# of any line, because extracted page text normally spans several lines and
# the heading is rarely the very first characters of the page.
_MID_PATTERN = re.compile(r'^一\s*[、.]\s*(说\s*明|总\s*则)', re.MULTILINE)


def clean_page_content(text, common_header):
    """Return *text* with every occurrence of *common_header* removed.

    Stand-in for the real page-cleaning routine. A falsy header (``''`` or
    ``None``) leaves the text untouched instead of raising inside
    ``str.replace``.
    """
    if not common_header:
        return text
    return text.replace(common_header, '')


def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern,
                                   begin_page, common_header):
    """Locate the start, middle and end page indices of a section.

    Pages are scanned in order. For each page the text is extracted
    (``None`` is treated as an empty string) and the common header removed.

    Args:
        pdf_document: Object exposing ``pages``, an iterable of objects with
            an ``extract_text()`` method.
        begin_pattern: Regex marking the first page of the section.
        end_pattern: Regex marking the last page of the section.
        begin_page: Only pages with an index strictly greater than this value
            may become the start page.
        common_header: Header text stripped from every page before matching.

    Returns:
        A ``(start_page, mid_page, end_page)`` tuple of zero-based page
        indices. Any element is ``None`` when the corresponding page was not
        found. ``end_page`` is only set once both a start and a mid page have
        been found, and must lie strictly after the mid page.
    """
    start_page = None
    mid_page = None
    end_page = None

    for i, page in enumerate(pdf_document.pages):
        text = page.extract_text() or ""
        cleaned_text = clean_page_content(text, common_header)

        if (start_page is None and i > begin_page
                and re.search(begin_pattern, cleaned_text)):
            start_page = i

        # The mid page may be the same page as the start page.
        if (start_page is not None and mid_page is None
                and _MID_PATTERN.search(cleaned_text)):
            mid_page = i

        # The end marker only counts on a page after the mid page.
        if (start_page is not None and mid_page is not None and i > mid_page
                and re.search(end_pattern, cleaned_text)):
            end_page = i
            break

    return start_page, mid_page, end_page


def create_mock_pdf(page_contents):
    """Build a mock PDF document whose pages return the given texts.

    Args:
        page_contents: Iterable of strings, one per page, in page order.

    Returns:
        A ``Mock`` with a ``pages`` list; each page's ``extract_text()``
        returns the corresponding string.
    """
    pdf_mock = Mock()
    # Bind ``content`` as a default argument: a plain ``lambda: content``
    # closes over the loop variable, so every page would return the text of
    # the LAST page.
    pdf_mock.pages = [
        Mock(extract_text=lambda text=content: text)
        for content in page_contents
    ]
    return pdf_mock


def test_extract_pages():
    """Run the page-extraction function on four mock documents and print results."""
    # Case 1: normal document.
    pdf1 = create_mock_pdf([
        "Page 0",
        "第一章 投标人须知",
        "其他内容",
        "一、说明",
        "更多内容",
        "结束标记"
    ])
    result1 = extract_pages_tobidders_notice(pdf1, r'第.+章\s*投标人须知', r'结束标记', -1, '')
    print("Test Case 1:", result1)  # expected: (1, 3, 5)

    # Case 2: no mid page. The end page is only accepted after a mid page,
    # so it stays None as well.
    pdf2 = create_mock_pdf([
        "Page 0",
        "第一章 投标人须知",
        "其他内容",
        "结束标记"
    ])
    result2 = extract_pages_tobidders_notice(pdf2, r'第.+章\s*投标人须知', r'结束标记', -1, '')
    print("Test Case 2:", result2)  # expected: (1, None, None)

    # Case 3: heading uses "总则" instead of "说明".
    pdf3 = create_mock_pdf([
        "Page 0",
        "第一章 投标人须知",
        "其他内容",
        "一、总则",
        "更多内容",
        "结束标记"
    ])
    result3 = extract_pages_tobidders_notice(pdf3, r'第.+章\s*投标人须知', r'结束标记', -1, '')
    print("Test Case 3:", result3)  # expected: (1, 3, 5)

    # Case 4: mid-page heading with a different separator and spacing.
    pdf4 = create_mock_pdf([
        "Page 0",
        "第一章 投标人须知",
        "其他内容",
        "一.说 明",
        "更多内容",
        "结束标记"
    ])
    result4 = extract_pages_tobidders_notice(pdf4, r'第.+章\s*投标人须知', r'结束标记', -1, '')
    print("Test Case 4:", result4)  # expected: (1, 3, 5)


# Run the demo tests.
test_extract_pages()