2.17 增加读文件pdf接口测试1
This commit is contained in:
parent
01abc7eb9f
commit
e80be4aa9c
@ -47,7 +47,7 @@ def extract_text_by_page(file_path):
|
||||
return result
|
||||
|
||||
|
||||
def process_pages(get_text, total_pages):
|
||||
def process_pages(get_text, common_header, total_pages):
|
||||
"""
|
||||
扫描所有页面,返回第一个和最后一个包含有效文本的页面索引。
|
||||
如果所有页面都为空,则返回 0 到 total_pages - 1。
|
||||
@ -57,6 +57,7 @@ def process_pages(get_text, total_pages):
|
||||
for page_num in range(total_pages):
|
||||
try:
|
||||
text = get_text(page_num)
|
||||
cleaned_text = clean_page_content(text, common_header)
|
||||
except Exception as e:
|
||||
print(f"读取第 {page_num} 页失败: {e}")
|
||||
continue
|
||||
@ -75,12 +76,13 @@ def process_pages(get_text, total_pages):
|
||||
return start_page, end_page
|
||||
|
||||
def read_pdf_main(pdf_path):
|
||||
common_header=extract_common_header(pdf_path)
|
||||
try:
|
||||
with open(pdf_path, "rb") as f:
|
||||
pdf_document = PdfReader(f)
|
||||
total_pages = len(pdf_document.pages)
|
||||
get_text = create_get_text_function('pypdf2', pdf_document)
|
||||
start_page, end_page = process_pages(get_text, total_pages)
|
||||
start_page, end_page = process_pages(get_text, common_header,total_pages)
|
||||
return start_page, end_page
|
||||
except Exception as e_pypdf2:
|
||||
print(f"extract_pages_generic: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
|
||||
@ -90,7 +92,7 @@ def read_pdf_main(pdf_path):
|
||||
with fitz.open(pdf_path) as pdf_document:
|
||||
total_pages = pdf_document.page_count
|
||||
get_text = create_get_text_function('fitz', pdf_document)
|
||||
start_page, end_page = process_pages(get_text, total_pages)
|
||||
start_page, end_page = process_pages(get_text, common_header,total_pages)
|
||||
return start_page, end_page
|
||||
except Exception as e_pypdf2:
|
||||
print(f"extract_pages_generic: 使用 fitz 读取 PDF 失败: {e_pypdf2}")
|
||||
|
Loading…
x
Reference in New Issue
Block a user