From e80be4aa9cedc1b1287d7267939e1c1bbf494ed2 Mon Sep 17 00:00:00 2001
From: zy123 <646228430@qq.com>
Date: Mon, 17 Feb 2025 11:42:37 +0800
Subject: [PATCH 1/3] =?UTF-8?q?2.17=20=E5=A2=9E=E5=8A=A0=E8=AF=BB=E6=96=87?=
 =?UTF-8?q?=E4=BB=B6pdf=E6=8E=A5=E5=8F=A3=E6=B5=8B=E8=AF=951?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 flask_app/general/读取文件/按页读取pdf.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/flask_app/general/读取文件/按页读取pdf.py b/flask_app/general/读取文件/按页读取pdf.py
index 8b3b10f..3f12648 100644
--- a/flask_app/general/读取文件/按页读取pdf.py
+++ b/flask_app/general/读取文件/按页读取pdf.py
@@ -47,7 +47,7 @@ def extract_text_by_page(file_path):
         return result
 
 
-def process_pages(get_text, total_pages):
+def process_pages(get_text, common_header, total_pages):
     """
     扫描所有页面，返回第一个和最后一个包含有效文本的页面索引。
     如果所有页面都为空，则返回 0 到 total_pages - 1。
@@ -57,6 +57,7 @@ def process_pages(get_text, total_pages):
     for page_num in range(total_pages):
         try:
             text = get_text(page_num)
+            cleaned_text = clean_page_content(text, common_header)
         except Exception as e:
             print(f"读取第 {page_num} 页失败: {e}")
             continue
@@ -75,12 +76,13 @@ def process_pages(get_text, total_pages):
     return start_page, end_page
 
 def read_pdf_main(pdf_path):
+    common_header=extract_common_header(pdf_path)
     try:
         with open(pdf_path, "rb") as f:
             pdf_document = PdfReader(f)
             total_pages = len(pdf_document.pages)
             get_text = create_get_text_function('pypdf2', pdf_document)
-            start_page, end_page = process_pages(get_text, total_pages)
+            start_page, end_page = process_pages(get_text, common_header,total_pages)
             return start_page, end_page
     except Exception as e_pypdf2:
         print(f"extract_pages_generic: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
@@ -90,7 +92,7 @@ def read_pdf_main(pdf_path):
         with fitz.open(pdf_path) as pdf_document:
             total_pages = pdf_document.page_count
             get_text = create_get_text_function('fitz', pdf_document)
-            start_page, end_page = process_pages(get_text, total_pages)
+            start_page, end_page = process_pages(get_text, common_header,total_pages)
         return start_page, end_page
     except Exception as e_pypdf2:
         print(f"extract_pages_generic: 使用 fitz 读取 PDF 失败: {e_pypdf2}")

From 043a4dd950ba21ba39dde424c2bf39204eee666b Mon Sep 17 00:00:00 2001
From: zy123 <646228430@qq.com>
Date: Mon, 17 Feb 2025 11:55:03 +0800
Subject: [PATCH 2/3] =?UTF-8?q?2.17=20=E5=A2=9E=E5=8A=A0=E8=AF=BB=E6=96=87?=
 =?UTF-8?q?=E4=BB=B6pdf=E6=8E=A5=E5=8F=A3=E6=B5=8B=E8=AF=951?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 flask_app/general/读取文件/按页读取pdf.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/flask_app/general/读取文件/按页读取pdf.py b/flask_app/general/读取文件/按页读取pdf.py
index 3f12648..5842f8c 100644
--- a/flask_app/general/读取文件/按页读取pdf.py
+++ b/flask_app/general/读取文件/按页读取pdf.py
@@ -76,7 +76,8 @@ def process_pages(get_text, common_header, total_pages):
     return start_page, end_page
 
 def read_pdf_main(pdf_path):
-    common_header=extract_common_header(pdf_path)
+    # NOTE(review): header extraction is disabled and an empty header is used; to re-enable: common_header=extract_common_header(pdf_path)
+    common_header=""
     try:
         with open(pdf_path, "rb") as f:
             pdf_document = PdfReader(f)

From 3a9bafc63f381f67f744337f59f02084d8c4ed73 Mon Sep 17 00:00:00 2001
From: zy123 <646228430@qq.com>
Date: Mon, 17 Feb 2025 12:34:47 +0800
Subject: [PATCH 3/3] =?UTF-8?q?2.17=20=E5=A2=9E=E5=8A=A0=E8=AF=BB=E6=96=87?=
 =?UTF-8?q?=E4=BB=B6pdf=E6=8E=A5=E5=8F=A3=E6=B5=8B=E8=AF=951?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 flask_app/routes/test_preprocess.py | 7 ++++---
 flask_app/routes/test_readpdf.py    | 5 +++--
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/flask_app/routes/test_preprocess.py b/flask_app/routes/test_preprocess.py
index 45038ae..04c8990 100644
--- a/flask_app/routes/test_preprocess.py
+++ b/flask_app/routes/test_preprocess.py
@@ -1,3 +1,5 @@
+import os
+
 from flask import request, jsonify, Blueprint, g
 import uuid
 import time
@@ -23,10 +25,9 @@ def process_file():
             return jsonify({'error': 'Missing file_url parameter'})
 
         # 生成唯一文件名
-        file_ext = '.pdf'
-        filename = f"{uuid.uuid4().hex}{file_ext}"
+        filename = os.path.join(output_folder,'ztbfile.pdf')
         file_path,file_type=download_file(file_url, filename)
-        print(file_path)
+        # print(file_path)
         # 调用预处理函数
         start_time = time.time()
         result = preprocess_files(
diff --git a/flask_app/routes/test_readpdf.py b/flask_app/routes/test_readpdf.py
index 7d130ed..4110bd7 100644
--- a/flask_app/routes/test_readpdf.py
+++ b/flask_app/routes/test_readpdf.py
@@ -1,3 +1,5 @@
+import os.path
+
 from flask import request, jsonify, Blueprint, g
 import uuid
 import time
@@ -24,8 +26,7 @@ def process_file():
             return jsonify({'error': 'Missing file_url parameter'})
 
         # 生成唯一文件名
-        file_ext = '.pdf'
-        filename = f"{uuid.uuid4().hex}{file_ext}"
+        filename = os.path.join(output_folder,'ztbfile.pdf')
         file_path,file_type=download_file(file_url, filename)
         # print(file_path)
         # 调用预处理函数