# 79 lines · 2.7 KiB · Python
import re
|
||
from unittest.mock import Mock
|
||
|
||
def clean_page_content(text, common_header):
    """Return *text* with every occurrence of *common_header* removed.

    This is a simplified stand-in for a page-cleaning routine: it performs a
    plain substring removal and nothing else.

    Args:
        text: Page text to clean.
        common_header: Substring to strip out; an empty string leaves
            *text* unchanged.

    Returns:
        The text with all occurrences of *common_header* deleted.
    """
    without_header = text.replace(common_header, '')
    return without_header
|
||
|
||
def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
    """Locate the start, middle and end page indices of a document section.

    Pages are scanned in order. Each page's text is passed through
    ``clean_page_content`` (with *common_header*) before any pattern is tried.

    Args:
        pdf_document: Object exposing ``.pages``; each page must provide
            ``extract_text()``. A falsy result (``None`` or ``""``) is
            treated as an empty string.
        begin_pattern: Regex identifying the page that opens the section.
        end_pattern: Regex identifying the page that closes the section.
        begin_page: The start page is only accepted at an index strictly
            greater than this value.
        common_header: Text removed from every page before matching.

    Returns:
        A ``(start_page, mid_page, end_page)`` tuple of 0-based page indices.
        Any element is ``None`` when the corresponding page was not found.
        ``end_page`` is only searched for once ``mid_page`` is known and must
        lie strictly after it, so a document without the middle heading
        always yields ``end_page is None``.
    """
    # Heading such as "一、说明" / "一.总 则". The "^" is used without
    # re.MULTILINE, so it only matches at the very start of the cleaned text.
    section_heading = r'^一\s*[、.]\s*(说\s*明|总\s*则)'

    first_idx = None
    middle_idx = None
    last_idx = None

    for page_no, pdf_page in enumerate(pdf_document.pages):
        raw = pdf_page.extract_text()
        body = clean_page_content(raw if raw else "", common_header)

        if first_idx is None:
            if re.search(begin_pattern, body) and page_no > begin_page:
                first_idx = page_no
            else:
                # Nothing else can match until the start page is known.
                continue

        if middle_idx is None:
            if re.search(section_heading, body):
                middle_idx = page_no
            else:
                # The end marker is only considered after the middle heading.
                continue

        # The end page must come strictly after the middle page, so the page
        # that just became the middle page can never also be the end page.
        if re.search(end_pattern, body) and page_no > middle_idx:
            last_idx = page_no
            break

    return first_idx, middle_idx, last_idx
|
||
|
||
# Build a mock PDF document
def create_mock_pdf(page_contents):
    """Create a mock PDF whose pages return the given texts.

    Args:
        page_contents: Iterable of values; the i-th value is what the i-th
            page's ``extract_text()`` returns.

    Returns:
        A ``Mock`` with a ``pages`` list, one ``Mock`` page per entry of
        *page_contents*, in the same order.
    """
    pdf_mock = Mock()
    # Each page gets its own return_value. A bare ``lambda: content`` here
    # would close over the comprehension variable and make every page return
    # the last entry (late-binding closure).
    pdf_mock.pages = [
        Mock(extract_text=Mock(return_value=content))
        for content in page_contents
    ]
    return pdf_mock
|
||
|
||
# Test cases
def test_extract_pages():
    """Run four mock-PDF scenarios through the extractor and print each result.

    Every scenario uses the same chapter-heading and end-marker patterns, a
    ``begin_page`` of -1 and an empty common header. Results are printed, not
    asserted.
    """
    chapter_heading = r'第.+章\s*投标人须知'
    closing_marker = r'结束标记'

    scenarios = [
        # Case 1: normal document. Intended result: (1, 3, 5)
        ("Test Case 1:", [
            "Page 0",
            "第一章 投标人须知",
            "其他内容",
            "一、说明",
            "更多内容",
            "结束标记",
        ]),
        # Case 2: no middle heading. The original note expects (1, None, 3).
        # NOTE(review): extract_pages_tobidders_notice only looks for the end
        # marker after the middle heading has been found, so as written it
        # would report (1, None, None) here - confirm which is intended.
        ("Test Case 2:", [
            "Page 0",
            "第一章 投标人须知",
            "其他内容",
            "结束标记",
        ]),
        # Case 3: middle heading uses "总则" instead of "说明".
        # Intended result: (1, 3, 5)
        ("Test Case 3:", [
            "Page 0",
            "第一章 投标人须知",
            "其他内容",
            "一、总则",
            "更多内容",
            "结束标记",
        ]),
        # Case 4: middle heading with a different separator and spacing.
        # Intended result: (1, 3, 5)
        ("Test Case 4:", [
            "Page 0",
            "第一章 投标人须知",
            "其他内容",
            "一.说 明",
            "更多内容",
            "结束标记",
        ]),
    ]

    for label, pages in scenarios:
        outcome = extract_pages_tobidders_notice(
            create_mock_pdf(pages), chapter_heading, closing_marker, -1, ''
        )
        print(label, outcome)
|
||
|
||
# Run the tests
test_extract_pages()