2024-09-27 15:00:30 +08:00

79 lines
2.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
from unittest.mock import Mock
def clean_page_content(text, common_header):
    """Return *text* with every occurrence of *common_header* removed.

    This is a simplified stand-in (mock) for a page-cleaning routine: it
    performs a plain substring removal and nothing else.

    Args:
        text: Raw page text. ``None`` or an empty string yields ``''``.
        common_header: Header string to strip out. ``None`` or an empty
            string means there is nothing to remove, so *text* is
            returned unchanged.

    Returns:
        The page text without the common header.
    """
    if not text:
        # Pages without extractable text have nothing to clean.
        return ''
    if not common_header:
        # No header configured; avoid calling str.replace with None.
        return text
    return text.replace(common_header, '')
def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
    """Locate the start, middle and end page indices of a section.

    Pages of ``pdf_document.pages`` are scanned in order. Each page's text
    is obtained with ``extract_text()`` (``None`` is treated as ``""``) and
    passed through ``clean_page_content`` before matching.

    * start: the first page whose index is greater than *begin_page* and
      whose cleaned text matches *begin_pattern*.
    * middle: the first page, at or after the start page, whose cleaned
      text matches the hard-coded heading pattern ("一、说明" / "一、总则"
      and spacing variants).
    * end: the first page strictly after the middle page whose cleaned
      text matches *end_pattern*. Scanning stops once it is found.

    Args:
        pdf_document: Object exposing a ``pages`` iterable whose items
            provide ``extract_text()``.
        begin_pattern: Regex marking the start page.
        end_pattern: Regex marking the end page.
        begin_page: Only page indices strictly greater than this value may
            become the start page.
        common_header: Header text forwarded to ``clean_page_content``.

    Returns:
        A ``(start_page, mid_page, end_page)`` tuple of 0-based indices;
        any element that was not found is ``None``.
    """
    # NOTE(review): '^' is used without re.MULTILINE, so the heading only
    # matches at the very beginning of the cleaned page text -- confirm
    # that this is intended.
    heading_regex = r'^一\s*[、.]\s*(说\s*明|总\s*则)'

    first_idx = None
    heading_idx = None
    last_idx = None

    for page_no, pdf_page in enumerate(pdf_document.pages):
        raw_text = pdf_page.extract_text() or ""
        body = clean_page_content(raw_text, common_header)

        if first_idx is None:
            if not (re.search(begin_pattern, body) and page_no > begin_page):
                # Nothing else is examined until the section start is known.
                continue
            first_idx = page_no

        if heading_idx is None:
            if not re.search(heading_regex, body):
                # NOTE(review): an end page is never recorded unless a
                # middle page has been found first -- confirm intended.
                continue
            heading_idx = page_no

        # The end marker must sit on a later page than the heading.
        if re.search(end_pattern, body) and page_no > heading_idx:
            last_idx = page_no
            break

    return first_idx, heading_idx, last_idx
# Build a mock PDF document
def create_mock_pdf(page_contents):
    """Build a mock PDF document whose pages return the given texts.

    Args:
        page_contents: Iterable of strings, one per page, in page order.

    Returns:
        A ``Mock`` with a ``pages`` list; calling ``extract_text()`` on the
        i-th page returns the i-th entry of *page_contents*.
    """
    pdf_mock = Mock()
    # Give every page its own return value. A bare ``lambda: content``
    # inside the comprehension is late-binding: all pages would end up
    # returning the text of the last page.
    pdf_mock.pages = [
        Mock(extract_text=Mock(return_value=content))
        for content in page_contents
    ]
    return pdf_mock
# Test cases
def test_extract_pages():
    """Run the page extractor over four mock documents and print each result.

    Every scenario builds a mock PDF with ``create_mock_pdf`` and calls
    ``extract_pages_tobidders_notice`` with the same begin/end patterns,
    ``begin_page=-1`` and an empty common header. Results are printed,
    not asserted.
    """
    begin_regex = r'第.+章\s*投标人须知'
    end_regex = r'结束标记'

    scenarios = [
        # Case 1: normal document -- expected (1, 3, 5).
        ("Test Case 1:", [
            "Page 0",
            "第一章 投标人须知",
            "其他内容",
            "一、说明",
            "更多内容",
            "结束标记",
        ]),
        # Case 2: no middle page. The original note expected (1, None, 3),
        # but the extractor only records an end page after a middle page
        # has been found, which gives (1, None, None).
        # NOTE(review): confirm which behavior is intended.
        ("Test Case 2:", [
            "Page 0",
            "第一章 投标人须知",
            "其他内容",
            "结束标记",
        ]),
        # Case 3: heading uses "总则" instead of "说明" -- expected (1, 3, 5).
        ("Test Case 3:", [
            "Page 0",
            "第一章 投标人须知",
            "其他内容",
            "一、总则",
            "更多内容",
            "结束标记",
        ]),
        # Case 4: heading with a different separator and inner spacing --
        # expected (1, 3, 5).
        ("Test Case 4:", [
            "Page 0",
            "第一章 投标人须知",
            "其他内容",
            "一.说 明",
            "更多内容",
            "结束标记",
        ]),
    ]

    for label, page_texts in scenarios:
        document = create_mock_pdf(page_texts)
        outcome = extract_pages_tobidders_notice(document, begin_regex, end_regex, -1, '')
        print(label, outcome)
# Run the tests
# Run the demo only when this file is executed as a script, so that merely
# importing the module (for example by a test runner that collects
# test_extract_pages) does not trigger it as an import side effect.
if __name__ == "__main__":
    test_extract_pages()