Merge branch 'develop-test' into develop-2.16
This commit is contained in:
commit
f6da90d230
@ -47,7 +47,7 @@ def extract_text_by_page(file_path):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def process_pages(get_text, total_pages):
|
def process_pages(get_text, common_header, total_pages):
|
||||||
"""
|
"""
|
||||||
扫描所有页面,返回第一个和最后一个包含有效文本的页面索引。
|
扫描所有页面,返回第一个和最后一个包含有效文本的页面索引。
|
||||||
如果所有页面都为空,则返回 0 到 total_pages - 1。
|
如果所有页面都为空,则返回 0 到 total_pages - 1。
|
||||||
@ -57,6 +57,7 @@ def process_pages(get_text, total_pages):
|
|||||||
for page_num in range(total_pages):
|
for page_num in range(total_pages):
|
||||||
try:
|
try:
|
||||||
text = get_text(page_num)
|
text = get_text(page_num)
|
||||||
|
cleaned_text = clean_page_content(text, common_header)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"读取第 {page_num} 页失败: {e}")
|
print(f"读取第 {page_num} 页失败: {e}")
|
||||||
continue
|
continue
|
||||||
@ -75,12 +76,14 @@ def process_pages(get_text, total_pages):
|
|||||||
return start_page, end_page
|
return start_page, end_page
|
||||||
|
|
||||||
def read_pdf_main(pdf_path):
|
def read_pdf_main(pdf_path):
|
||||||
|
# common_header=extract_common_header(pdf_path)
|
||||||
|
common_header=""
|
||||||
try:
|
try:
|
||||||
with open(pdf_path, "rb") as f:
|
with open(pdf_path, "rb") as f:
|
||||||
pdf_document = PdfReader(f)
|
pdf_document = PdfReader(f)
|
||||||
total_pages = len(pdf_document.pages)
|
total_pages = len(pdf_document.pages)
|
||||||
get_text = create_get_text_function('pypdf2', pdf_document)
|
get_text = create_get_text_function('pypdf2', pdf_document)
|
||||||
start_page, end_page = process_pages(get_text, total_pages)
|
start_page, end_page = process_pages(get_text, common_header,total_pages)
|
||||||
return start_page, end_page
|
return start_page, end_page
|
||||||
except Exception as e_pypdf2:
|
except Exception as e_pypdf2:
|
||||||
print(f"extract_pages_generic: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
|
print(f"extract_pages_generic: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
|
||||||
@ -90,7 +93,7 @@ def read_pdf_main(pdf_path):
|
|||||||
with fitz.open(pdf_path) as pdf_document:
|
with fitz.open(pdf_path) as pdf_document:
|
||||||
total_pages = pdf_document.page_count
|
total_pages = pdf_document.page_count
|
||||||
get_text = create_get_text_function('fitz', pdf_document)
|
get_text = create_get_text_function('fitz', pdf_document)
|
||||||
start_page, end_page = process_pages(get_text, total_pages)
|
start_page, end_page = process_pages(get_text, common_header,total_pages)
|
||||||
return start_page, end_page
|
return start_page, end_page
|
||||||
except Exception as e_pypdf2:
|
except Exception as e_pypdf2:
|
||||||
print(f"extract_pages_generic: 使用 fitz 读取 PDF 失败: {e_pypdf2}")
|
print(f"extract_pages_generic: 使用 fitz 读取 PDF 失败: {e_pypdf2}")
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
from flask import request, jsonify, Blueprint, g
|
from flask import request, jsonify, Blueprint, g
|
||||||
import uuid
|
import uuid
|
||||||
import time
|
import time
|
||||||
@ -23,10 +25,9 @@ def process_file():
|
|||||||
return jsonify({'error': 'Missing file_url parameter'})
|
return jsonify({'error': 'Missing file_url parameter'})
|
||||||
|
|
||||||
# 生成唯一文件名
|
# 生成唯一文件名
|
||||||
file_ext = '.pdf'
|
filename = os.path.join(output_folder,'ztbfile.pdf')
|
||||||
filename = f"{uuid.uuid4().hex}{file_ext}"
|
|
||||||
file_path,file_type=download_file(file_url, filename)
|
file_path,file_type=download_file(file_url, filename)
|
||||||
print(file_path)
|
# print(file_path)
|
||||||
# 调用预处理函数
|
# 调用预处理函数
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
result = preprocess_files(
|
result = preprocess_files(
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
import os.path
|
||||||
|
|
||||||
from flask import request, jsonify, Blueprint, g
|
from flask import request, jsonify, Blueprint, g
|
||||||
import uuid
|
import uuid
|
||||||
import time
|
import time
|
||||||
@ -24,8 +26,7 @@ def process_file():
|
|||||||
return jsonify({'error': 'Missing file_url parameter'})
|
return jsonify({'error': 'Missing file_url parameter'})
|
||||||
|
|
||||||
# 生成唯一文件名
|
# 生成唯一文件名
|
||||||
file_ext = '.pdf'
|
filename = os.path.join(output_folder,'ztbfile.pdf')
|
||||||
filename = f"{uuid.uuid4().hex}{file_ext}"
|
|
||||||
file_path,file_type=download_file(file_url, filename)
|
file_path,file_type=download_file(file_url, filename)
|
||||||
# print(file_path)
|
# print(file_path)
|
||||||
# 调用预处理函数
|
# 调用预处理函数
|
||||||
|
Loading…
x
Reference in New Issue
Block a user