79 lines
2.7 KiB
Python
Raw Permalink Normal View History

2024-09-20 18:01:48 +08:00
import re
2024-09-27 15:00:30 +08:00
from unittest.mock import Mock
2024-09-20 18:01:48 +08:00
2024-09-27 15:00:30 +08:00
def clean_page_content(text, common_header):
    """Return ``text`` with every occurrence of ``common_header`` removed.

    Simplified stand-in for a page-cleaning routine: it performs a plain
    substring removal and nothing else.

    Args:
        text: Text of a single page.
        common_header: Header string to strip out. An empty string leaves
            ``text`` unchanged.

    Returns:
        The page text with the header removed.
    """
    header_free = text.replace(common_header, '')
    return header_free
def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
    """Locate the start, middle and end pages of the bidders-notice section.

    Pages are scanned once, in order, and the search moves through three
    stages:

    1. start page: first page whose index is greater than ``begin_page``
       and whose cleaned text matches ``begin_pattern``;
    2. mid page: first page, at or after the start page, whose cleaned text
       *begins* with the "一、说明" / "一、总则" heading (whitespace between
       the characters is tolerated; the separator may be "、" or ".");
    3. end page: first page strictly after the mid page whose cleaned text
       matches ``end_pattern``.

    Args:
        pdf_document: Object exposing ``pages``, each page providing
            ``extract_text()``.
        begin_pattern: Regex marking the start of the section.
        end_pattern: Regex marking the end of the section.
        begin_page: Only pages with an index greater than this value may be
            chosen as the start page.
        common_header: Header text removed from every page before matching.

    Returns:
        Tuple ``(start_page, mid_page, end_page)`` of zero-based page
        indices. Any stage that was never reached is ``None``.
    """
    # Anchored with ``^`` and searched without re.MULTILINE, so the heading
    # has to sit at the very beginning of the cleaned page text.
    mid_heading = r'^一\s*[、.]\s*(说\s*明|总\s*则)'

    start_page = None
    mid_page = None
    end_page = None

    for index, page in enumerate(pdf_document.pages):
        raw_text = page.extract_text() or ""
        page_text = clean_page_content(raw_text, common_header)

        # Stage 1: nothing else is examined until the section start is found.
        if start_page is None:
            if not (re.search(begin_pattern, page_text) and index > begin_page):
                continue
            start_page = index

        # Stage 2: the start page itself may also carry the mid heading.
        # Whether or not it matches, this page cannot be the end page: the
        # end must come strictly after the mid page.
        if mid_page is None:
            if re.search(mid_heading, page_text):
                mid_page = index
            continue

        # Stage 3: the mid page was found on an earlier iteration, so this
        # page is already strictly after it.
        if re.search(end_pattern, page_text):
            end_page = index
            break

    return start_page, mid_page, end_page
# Build a mock PDF document
def create_mock_pdf(page_contents):
    """Create a mock PDF object whose pages return the given texts.

    Args:
        page_contents: Iterable of strings, one per page, in page order.

    Returns:
        A ``Mock`` with a ``pages`` list. Calling ``extract_text()`` on the
        n-th page returns the n-th entry of ``page_contents``.
    """
    pdf_mock = Mock()
    # Each page gets its own Mock(return_value=...). A bare ``lambda: content``
    # inside the comprehension would look ``content`` up at call time, after
    # the loop has finished, so every page would return the last text.
    pdf_mock.pages = [
        Mock(extract_text=Mock(return_value=content))
        for content in page_contents
    ]
    return pdf_mock
# Test cases
def test_extract_pages():
    """Run extract_pages_tobidders_notice on four mock documents and print the results.

    Every scenario uses the same begin/end patterns, ``begin_page=-1`` and an
    empty common header; only the page texts differ. Results are printed,
    not asserted.
    """
    begin_pattern = r'第.+章\s*投标人须知'
    end_pattern = r'结束标记'

    scenarios = [
        # Case 1: normal document. Expected output: (1, 3, 5)
        ("Test Case 1:", [
            "Page 0",
            "第一章 投标人须知",
            "其他内容",
            "一、说明",
            "更多内容",
            "结束标记",
        ]),
        # Case 2: no mid page. The original note expects (1, None, 3).
        # NOTE(review): extract_pages_tobidders_notice only looks for the end
        # page after a mid page has been found - confirm this expectation.
        ("Test Case 2:", [
            "Page 0",
            "第一章 投标人须知",
            "其他内容",
            "结束标记",
        ]),
        # Case 3: heading uses "总则" instead of "说明". Expected output: (1, 3, 5)
        ("Test Case 3:", [
            "Page 0",
            "第一章 投标人须知",
            "其他内容",
            "一、总则",
            "更多内容",
            "结束标记",
        ]),
        # Case 4: mid heading with a different format. Expected output: (1, 3, 5)
        ("Test Case 4:", [
            "Page 0",
            "第一章 投标人须知",
            "其他内容",
            "一.说 明",
            "更多内容",
            "结束标记",
        ]),
    ]

    for label, page_texts in scenarios:
        mock_pdf = create_mock_pdf(page_texts)
        outcome = extract_pages_tobidders_notice(mock_pdf, begin_pattern, end_pattern, -1, '')
        print(label, outcome)
# Run the tests: executes unconditionally at module level (no __main__ guard).
test_extract_pages()