diff --git a/flask_app/general/读取文件/按页读取pdf.py b/flask_app/general/读取文件/按页读取pdf.py index 8b3b10f..5842f8c 100644 --- a/flask_app/general/读取文件/按页读取pdf.py +++ b/flask_app/general/读取文件/按页读取pdf.py @@ -47,7 +47,7 @@ def extract_text_by_page(file_path): return result -def process_pages(get_text, total_pages): +def process_pages(get_text, common_header, total_pages): """ 扫描所有页面,返回第一个和最后一个包含有效文本的页面索引。 如果所有页面都为空,则返回 0 到 total_pages - 1。 @@ -57,6 +57,7 @@ def process_pages(get_text, total_pages): for page_num in range(total_pages): try: text = get_text(page_num) + cleaned_text = clean_page_content(text, common_header) except Exception as e: print(f"读取第 {page_num} 页失败: {e}") continue @@ -75,12 +76,14 @@ def process_pages(get_text, total_pages): return start_page, end_page def read_pdf_main(pdf_path): + # common_header=extract_common_header(pdf_path) + common_header="" try: with open(pdf_path, "rb") as f: pdf_document = PdfReader(f) total_pages = len(pdf_document.pages) get_text = create_get_text_function('pypdf2', pdf_document) - start_page, end_page = process_pages(get_text, total_pages) + start_page, end_page = process_pages(get_text, common_header,total_pages) return start_page, end_page except Exception as e_pypdf2: print(f"extract_pages_generic: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}") @@ -90,7 +93,7 @@ def read_pdf_main(pdf_path): with fitz.open(pdf_path) as pdf_document: total_pages = pdf_document.page_count get_text = create_get_text_function('fitz', pdf_document) - start_page, end_page = process_pages(get_text, total_pages) + start_page, end_page = process_pages(get_text, common_header,total_pages) return start_page, end_page except Exception as e_pypdf2: print(f"extract_pages_generic: 使用 fitz 读取 PDF 失败: {e_pypdf2}") diff --git a/flask_app/routes/test_preprocess.py b/flask_app/routes/test_preprocess.py index 45038ae..04c8990 100644 --- a/flask_app/routes/test_preprocess.py +++ b/flask_app/routes/test_preprocess.py @@ -1,3 +1,5 @@ +import os + from flask import request, jsonify, Blueprint, g import uuid import time @@ -23,10 +25,9 @@ def process_file(): return jsonify({'error': 'Missing file_url parameter'}) # 生成唯一文件名 - file_ext = '.pdf' - filename = f"{uuid.uuid4().hex}{file_ext}" + filename = os.path.join(output_folder,'ztbfile.pdf') file_path,file_type=download_file(file_url, filename) - print(file_path) + # print(file_path) # 调用预处理函数 start_time = time.time() result = preprocess_files( diff --git a/flask_app/routes/test_readpdf.py b/flask_app/routes/test_readpdf.py index 7d130ed..4110bd7 100644 --- a/flask_app/routes/test_readpdf.py +++ b/flask_app/routes/test_readpdf.py @@ -1,3 +1,5 @@ +import os.path + from flask import request, jsonify, Blueprint, g import uuid import time @@ -24,8 +26,7 @@ def process_file(): return jsonify({'error': 'Missing file_url parameter'}) # 生成唯一文件名 - file_ext = '.pdf' - filename = f"{uuid.uuid4().hex}{file_ext}" + filename = os.path.join(output_folder,'ztbfile.pdf') file_path,file_type=download_file(file_url, filename) # print(file_path) # 调用预处理函数