2.14 尝试解决内存泄漏问题
This commit is contained in:
parent
cef8ff5415
commit
979d7fac21
@ -9,12 +9,10 @@ from flask_app.general.读取文件.clean_pdf import is_scanned_pdf, is_pure_ima
|
|||||||
def download_file(url, local_filename,enable=False):
|
def download_file(url, local_filename,enable=False):
|
||||||
"""
|
"""
|
||||||
下载文件并保存到本地,基于Content-Type设置文件扩展名。
|
下载文件并保存到本地,基于Content-Type设置文件扩展名。
|
||||||
|
|
||||||
参数:
|
参数:
|
||||||
- url (str): 文件的URL地址。
|
- url (str): 文件的URL地址。
|
||||||
- local_filename (str): 本地保存的文件名(不含扩展名)。
|
- local_filename (str): 本地保存的文件名(不含扩展名)。
|
||||||
- enable: 是否需要判断为扫描型/纯图片文件
|
- enable: 是否需要判断为扫描型/纯图片文件
|
||||||
|
|
||||||
返回:
|
返回:
|
||||||
- tuple: (完整文件名, 文件类型代码)
|
- tuple: (完整文件名, 文件类型代码)
|
||||||
文件类型代码:
|
文件类型代码:
|
||||||
@ -31,10 +29,10 @@ def download_file(url, local_filename,enable=False):
|
|||||||
# 获取Content-Type并猜测文件扩展名
|
# 获取Content-Type并猜测文件扩展名
|
||||||
content_type = response.headers.get('Content-Type', '')
|
content_type = response.headers.get('Content-Type', '')
|
||||||
extension = mimetypes.guess_extension(content_type, strict=False) or '.docx'
|
extension = mimetypes.guess_extension(content_type, strict=False) or '.docx'
|
||||||
|
extension_lower = extension.lower() #显式地将 extension 转换为小写(存入 extension_lower),更加健壮
|
||||||
# 分离文件名和现有扩展名
|
# 根据传入的 local_filename 构造完整的文件名
|
||||||
base, ext = os.path.splitext(local_filename)
|
base, current_ext = os.path.splitext(local_filename)
|
||||||
if ext.lower() != extension:
|
if current_ext.lower() != extension_lower:
|
||||||
full_filename = base + extension
|
full_filename = base + extension
|
||||||
else:
|
else:
|
||||||
full_filename = local_filename
|
full_filename = local_filename
|
||||||
@ -94,6 +92,7 @@ def local_file_2_url(file_path, url):
|
|||||||
print("format_change 文件上传成功")
|
print("format_change 文件上传成功")
|
||||||
receive_file_response = response.content.decode('utf-8')
|
receive_file_response = response.content.decode('utf-8')
|
||||||
receive_file_json = json.loads(receive_file_response)
|
receive_file_json = json.loads(receive_file_response)
|
||||||
|
print(receive_file_json)
|
||||||
receive_file_url = receive_file_json["data"]
|
receive_file_url = receive_file_json["data"]
|
||||||
|
|
||||||
else:
|
else:
|
||||||
@ -207,17 +206,17 @@ def get_pdf_page_count(file_path):
|
|||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# 替换为你的文件路径和API URL
|
# 替换为你的文件路径和API URL
|
||||||
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\1fbbb6ff-7ddc-40bb-8857-b7de37aece3f\\兴欣工程.pdf"
|
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\1fbbb6ff-7ddc-40bb-8857-b7de37aece3f\\兴欣工程.pdf"
|
||||||
# local_path_in = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.docx"
|
local_path_in = r"C:\Users\Administrator\Downloads\基础公司枞阳经开区新能源汽车零部件产业园基础设施建设项目二期桩基工程劳务分包.doc"
|
||||||
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf"
|
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf"
|
||||||
# local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf"
|
# local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf"
|
||||||
# local_path_in=r"C:\Users\Administrator\Desktop\fsdownload\457ee03d-c61c-4672-b959-2bbb35a1de29\ztbfile_invalid.pdf"
|
# local_path_in=r"C:\Users\Administrator\Desktop\fsdownload\457ee03d-c61c-4672-b959-2bbb35a1de29\ztbfile_invalid.pdf"
|
||||||
local_path_in = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\ztbfile(1).pdf"
|
# local_path_in = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\ztbfile(1).pdf"
|
||||||
intermediate_docx = pdf2docx(local_path_in)
|
# intermediate_docx = pdf2docx(local_path_in)
|
||||||
if intermediate_docx:
|
# if intermediate_docx:
|
||||||
normal_pdf = docx2pdf(intermediate_docx, force=True)
|
# normal_pdf = docx2pdf(intermediate_docx, force=True)
|
||||||
# # downloaded_file=pdf2docx(local_path_in)
|
# # downloaded_file=pdf2docx(local_path_in)
|
||||||
# downloaded_file=docx2pdf(local_path_in)
|
downloaded_file=docx2pdf(local_path_in)
|
||||||
# print(downloaded_file)
|
print(downloaded_file)
|
||||||
|
|
||||||
# test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/%E6%B5%8B%E8%AF%95%E4%BF%A1%E5%8F%B7%E6%B5%8B%E8%AF%95%E4%BF%A1%E5%8F%B7.pdf?Expires=1736852995&OSSAccessKeyId=TMP.3Kg1oKKcsSWb7DXNe4F56bfGfKY5nNWUi274p39HyY7GR3mghMCaFWy69Fi83SBab6PmSkErh4JUD4yAxAGzVVx2hxxoxm&Signature=rmxS5lett4MzWdksDI57EujCklw%3"
|
# test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/%E6%B5%8B%E8%AF%95%E4%BF%A1%E5%8F%B7%E6%B5%8B%E8%AF%95%E4%BF%A1%E5%8F%B7.pdf?Expires=1736852995&OSSAccessKeyId=TMP.3Kg1oKKcsSWb7DXNe4F56bfGfKY5nNWUi274p39HyY7GR3mghMCaFWy69Fi83SBab6PmSkErh4JUD4yAxAGzVVx2hxxoxm&Signature=rmxS5lett4MzWdksDI57EujCklw%3"
|
||||||
# local_file_name = r'D:\flask_project\flask_app\static\output\output1\1d763771-f25a-4b65-839e-3b2ca56577b1\tmp\ztbfile.pdf'
|
# local_file_name = r'D:\flask_project\flask_app\static\output\output1\1d763771-f25a-4b65-839e-3b2ca56577b1\tmp\ztbfile.pdf'
|
||||||
|
@ -23,36 +23,32 @@ def insert_mark(input_pdf_path):
|
|||||||
|
|
||||||
# 遍历每一页
|
# 遍历每一页
|
||||||
for page_num in range(total_pages):
|
for page_num in range(total_pages):
|
||||||
|
# 添加原始页面
|
||||||
page = pdf_reader.pages[page_num]
|
page = pdf_reader.pages[page_num]
|
||||||
pdf_writer.add_page(page)
|
pdf_writer.add_page(page)
|
||||||
|
|
||||||
# 创建一个内存中的PDF,用于存放带有文本的空白页
|
# 使用 BytesIO 的上下文管理器创建内存中的 PDF 页面用于标记
|
||||||
packet = BytesIO()
|
with BytesIO() as packet:
|
||||||
# 获取当前页面的宽度和高度
|
# 获取当前页面的宽度和高度
|
||||||
page_width = float(page.mediabox.width)
|
page_width = float(page.mediabox.width)
|
||||||
page_height = float(page.mediabox.height)
|
page_height = float(page.mediabox.height)
|
||||||
# 使用reportlab创建一个新的PDF页面
|
# 使用 reportlab 创建一个新的 PDF 页面,页面大小与当前页面相同
|
||||||
c = canvas.Canvas(packet, pagesize=(page_width, page_height))
|
c = canvas.Canvas(packet, pagesize=(page_width, page_height))
|
||||||
|
|
||||||
# 计算文本的位置(单位:点,1厘米 ≈ 28.35点)
|
# 计算文本位置:距左边距 2.3cm,距顶部 0.5cm
|
||||||
x_position = 2.3 * cm
|
x_position = 2.3 * cm
|
||||||
y_position = page_height - 0.5 * cm # 从顶部开始计算,因此用页面高度减去上边距
|
y_position = page_height - 0.5 * cm
|
||||||
|
c.setFont("Helvetica", 12)
|
||||||
# 绘制文本,使用 (page_num + 1) 作为索引
|
|
||||||
c.setFont("Helvetica", 12) # 设置字体和大小
|
|
||||||
c.drawString(x_position, y_position, f"[$$index_mark_{page_num + 1}$$]")
|
c.drawString(x_position, y_position, f"[$$index_mark_{page_num + 1}$$]")
|
||||||
|
|
||||||
# 完成绘制
|
# 完成绘制,将内容写入 BytesIO 流
|
||||||
c.save()
|
c.save()
|
||||||
|
|
||||||
# 将内存中的PDF读入PyPDF2
|
# 重置流位置,读取刚刚生成的 PDF 页面
|
||||||
packet.seek(0)
|
packet.seek(0)
|
||||||
new_pdf = PdfReader(packet)
|
new_pdf = PdfReader(packet)
|
||||||
# blank_page = new_pdf.pages[0]
|
# 使用 deepcopy 确保页面对象独立,避免后续修改影响已添加页面
|
||||||
blank_page = copy.deepcopy(new_pdf.pages[0])
|
blank_page = copy.deepcopy(new_pdf.pages[0])
|
||||||
packet.truncate(0)
|
|
||||||
packet.seek(0)
|
|
||||||
# 将带有文本的空白页添加到写入器
|
|
||||||
pdf_writer.add_page(blank_page)
|
pdf_writer.add_page(blank_page)
|
||||||
|
|
||||||
# 将所有页面写入输出的PDF文件
|
# 将所有页面写入输出的PDF文件
|
||||||
@ -65,10 +61,14 @@ def insert_mark(input_pdf_path):
|
|||||||
print(f"发生错误: {e}")
|
print(f"发生错误: {e}")
|
||||||
return input_pdf_path
|
return input_pdf_path
|
||||||
|
|
||||||
|
|
||||||
def delete_mark(docx_path):
|
def delete_mark(docx_path):
|
||||||
|
"""
|
||||||
|
删除文档中包含标记文本 "[$$index_mark_" 的段落,
|
||||||
|
如果该段落后紧跟一个包含 "w:sectPr" 的空白分节符段落,也一并删除。
|
||||||
|
修改后的文档保存为 invalid_del.docx,并返回新文件路径。
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
docx = Document(docx_path)
|
doc = Document(docx_path)
|
||||||
except KeyError as e:
|
except KeyError as e:
|
||||||
print(f"Error opening document: {e}")
|
print(f"Error opening document: {e}")
|
||||||
return ""
|
return ""
|
||||||
@ -76,30 +76,41 @@ def delete_mark(docx_path):
|
|||||||
print(f"Invalid package: {e}")
|
print(f"Invalid package: {e}")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
# 继续处理文档
|
# 收集所有需要删除的段落,避免在遍历时直接修改列表
|
||||||
find_flag = False
|
paragraphs_to_remove = []
|
||||||
for para in docx.paragraphs:
|
paragraphs = doc.paragraphs
|
||||||
# 匹配标记: [$$index_mark_X$$]
|
i = 0
|
||||||
|
while i < len(paragraphs):
|
||||||
|
para = paragraphs[i]
|
||||||
if "[$$index_mark_" in para.text:
|
if "[$$index_mark_" in para.text:
|
||||||
para._element.getparent().remove(para._element) # 删标记
|
paragraphs_to_remove.append(para)
|
||||||
find_flag = True
|
# 如果当前段落后面有段落且该段落的 XML 中包含 "w:sectPr",也删除该段落
|
||||||
if find_flag and "w:sectPr" in para._element.xml: # 删空白分节符
|
if i + 1 < len(paragraphs):
|
||||||
para._element.getparent().remove(para._element)
|
next_para = paragraphs[i + 1]
|
||||||
find_flag = False
|
if "w:sectPr" in next_para._element.xml:
|
||||||
dir_path = os.path.dirname(docx_path)
|
paragraphs_to_remove.append(next_para)
|
||||||
new_file_path = os.path.join(dir_path, 'invalid_del.docx')
|
i += 1 # 跳过下一个段落,避免重复处理
|
||||||
|
i += 1
|
||||||
|
|
||||||
# 保存修改后的文档
|
# 从文档 XML 中移除收集到的段落
|
||||||
docx.save(new_file_path)
|
for para in paragraphs_to_remove:
|
||||||
|
parent = para._element.getparent()
|
||||||
|
if parent is not None:
|
||||||
|
parent.remove(para._element)
|
||||||
|
|
||||||
|
new_file_path = os.path.join(os.path.dirname(docx_path), 'invalid_del.docx')
|
||||||
|
doc.save(new_file_path)
|
||||||
return new_file_path
|
return new_file_path
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# input=r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\ztbfile.pdf'
|
# input=r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\ztbfile.pdf'
|
||||||
input=r'C:\Users\Administrator\Desktop\fsdownload\45645668-be5e-4124-b06a-6abaa9d87d86\ztbfile_invalid.pdf'
|
# input=r'C:\Users\Administrator\Desktop\fsdownload\45645668-be5e-4124-b06a-6abaa9d87d86\tmp\ztbfile_invalid.pdf'
|
||||||
output=insert_mark(input)
|
# output=insert_mark(input)
|
||||||
# doc_path = r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\invalid_added.docx'
|
doc_path = r'C:\Users\Administrator\Desktop\fsdownload\45645668-be5e-4124-b06a-6abaa9d87d86\tmp\invalid_added.docx'
|
||||||
# res=delete_mark(doc_path)
|
res=delete_mark(doc_path)
|
||||||
# if res:
|
if res:
|
||||||
# print(res)
|
print(res)
|
||||||
# else:
|
else:
|
||||||
# print("No")
|
print("No")
|
@ -6,9 +6,6 @@ import time
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from flask_app.general.llm.大模型通用函数 import get_total_tokens
|
|
||||||
from flask_app.general.llm.qianwen_turbo import qianwen_turbo
|
|
||||||
from flask_app.general.llm.大模型通用函数 import extract_error_details, shared_rate_limit
|
from flask_app.general.llm.大模型通用函数 import extract_error_details, shared_rate_limit
|
||||||
|
|
||||||
file_write_lock = threading.Lock()
|
file_write_lock = threading.Lock()
|
||||||
|
@ -1,8 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
import fitz
|
import fitz
|
||||||
from PyPDF2 import PdfReader, PdfWriter
|
from PyPDF2 import PdfReader, PdfWriter
|
||||||
|
|
||||||
from flask_app.general.截取pdf通用函数 import create_get_text_function
|
from flask_app.general.截取pdf通用函数 import create_get_text_function
|
||||||
|
|
||||||
|
|
||||||
@ -67,6 +65,7 @@ def merge_with_fitz(paths, output_path):
|
|||||||
for path in paths:
|
for path in paths:
|
||||||
if not path.strip():
|
if not path.strip():
|
||||||
continue
|
continue
|
||||||
|
pdf_doc = None
|
||||||
try:
|
try:
|
||||||
pdf_doc = fitz.open(path)
|
pdf_doc = fitz.open(path)
|
||||||
get_text = create_get_text_function('fitz', pdf_doc)
|
get_text = create_get_text_function('fitz', pdf_doc)
|
||||||
@ -88,7 +87,9 @@ def merge_with_fitz(paths, output_path):
|
|||||||
pdf_doc.close()
|
pdf_doc.close()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"文件 '{path}' 无法使用 fitz 处理,错误: {e}")
|
print(f"文件 '{path}' 无法使用 fitz 处理,错误: {e}")
|
||||||
continue
|
finally:
|
||||||
|
if pdf_doc is not None:
|
||||||
|
pdf_doc.close()
|
||||||
|
|
||||||
if merged_pdf.page_count == 0:
|
if merged_pdf.page_count == 0:
|
||||||
merged_pdf.close()
|
merged_pdf.close()
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
# flask_app/general/截取pdf通用函数.py
|
# flask_app/general/截取pdf通用函数.py
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
|
import io
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import fitz
|
import fitz
|
||||||
@ -37,8 +38,10 @@ def get_start_and_common_header(input_path, end_page):
|
|||||||
- common_header: 公共页眉内容。
|
- common_header: 公共页眉内容。
|
||||||
- last_begin_index: 起始页的页码索引。
|
- last_begin_index: 起始页的页码索引。
|
||||||
"""
|
"""
|
||||||
|
pdf_document = None
|
||||||
try:
|
try:
|
||||||
# 提取公共页眉
|
# 提取公共页眉
|
||||||
|
|
||||||
common_header = extract_common_header(input_path) # 假设该函数已定义
|
common_header = extract_common_header(input_path) # 假设该函数已定义
|
||||||
|
|
||||||
last_begin_index = -1
|
last_begin_index = -1
|
||||||
@ -48,7 +51,6 @@ def get_start_and_common_header(input_path, end_page):
|
|||||||
)
|
)
|
||||||
# 新增目录匹配模式
|
# 新增目录匹配模式
|
||||||
catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
|
catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# 尝试使用 PyPDF2 读取 PDF
|
# 尝试使用 PyPDF2 读取 PDF
|
||||||
pdf_document = PdfReader(input_path)
|
pdf_document = PdfReader(input_path)
|
||||||
@ -86,58 +88,77 @@ def get_start_and_common_header(input_path, end_page):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error in get_start_and_common_header: {e}")
|
print(f"Error in get_start_and_common_header: {e}")
|
||||||
return "", -1 # 根据需求调整返回值
|
return "", -1 # 根据需求调整返回值
|
||||||
|
finally:
|
||||||
|
# 如果使用的是 PyMuPDF,记得关闭文档
|
||||||
|
if pdf_document and hasattr(pdf_document, "close"):
|
||||||
|
pdf_document.close()
|
||||||
|
|
||||||
|
|
||||||
def check_pdf_pages(pdf_path, mode, logger):
|
def check_pdf_pages(pdf_path, mode, logger):
|
||||||
|
pdf_document = None
|
||||||
try:
|
try:
|
||||||
try:
|
try:
|
||||||
|
# 尝试使用 PyPDF2 读取 PDF(新版可直接传入文件路径)
|
||||||
pdf_document = PdfReader(pdf_path)
|
pdf_document = PdfReader(pdf_path)
|
||||||
num_pages = len(pdf_document.pages)
|
num_pages = len(pdf_document.pages)
|
||||||
# logger.info(f"使用 PyPDF2 成功读取 PDF '{pdf_path}' 的总页数为: {num_pages}")
|
# logger.info(f"使用 PyPDF2 成功读取 PDF '{pdf_path}' 的总页数为: {num_pages}")
|
||||||
except Exception as e_pypdf2:
|
except Exception as e_pypdf2:
|
||||||
print(f"check_pdf_pages:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
|
logger.warning(f"check_pdf_pages: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
|
||||||
|
# 如果 PyPDF2 读取失败,尝试使用 PyMuPDF 读取 PDF
|
||||||
pdf_document = fitz.open(pdf_path)
|
pdf_document = fitz.open(pdf_path)
|
||||||
num_pages = pdf_document.page_count
|
num_pages = pdf_document.page_count
|
||||||
|
|
||||||
if num_pages <= 30:
|
if num_pages <= 30:
|
||||||
logger.info("PDF(invalid_path)页数小于或等于30页,跳过切分逻辑。")
|
logger.info("PDF(invalid_path)页数小于或等于30页,跳过切分逻辑。")
|
||||||
if mode == 'goods':
|
if mode == 'goods':
|
||||||
return True, ['', '', '', '', '', '', pdf_path, '']
|
return True, ['', '', '', '', '', '', pdf_path, '']
|
||||||
else:
|
else:
|
||||||
return True, ['', '', '', '', '', pdf_path, '']
|
return True, ['', '', '', '', '', pdf_path, '']
|
||||||
# 若页数大于30页,返回None表示继续处理
|
|
||||||
|
# 若页数大于30页,返回 False 表示继续处理
|
||||||
return False, []
|
return False, []
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"无法读取 PDF 页数: {e}")
|
logger.error(f"无法读取 PDF 页数: {e}")
|
||||||
# 返回空列表意味着无法执行后续处理逻辑
|
|
||||||
if mode == 'goods':
|
if mode == 'goods':
|
||||||
return True, ['', '', '', '', '', '', pdf_path, '']
|
return True, ['', '', '', '', '', '', pdf_path, '']
|
||||||
else:
|
else:
|
||||||
return True, ['', '', '', '', '', pdf_path, '']
|
return True, ['', '', '', '', '', pdf_path, '']
|
||||||
|
|
||||||
|
finally:
|
||||||
|
# 如果使用的是 PyMuPDF 或其他需要关闭的对象,确保关闭资源
|
||||||
|
if pdf_document and hasattr(pdf_document, "close"):
|
||||||
|
pdf_document.close()
|
||||||
|
|
||||||
|
|
||||||
def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header):
|
def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header):
|
||||||
|
pdf_document = None
|
||||||
|
using_pypdf2 = False
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if start_page is None or end_page is None:
|
if start_page is None or end_page is None:
|
||||||
print("错误: start_page 或 end_page 为 None")
|
print("错误: start_page 或 end_page 为 None")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
# 尝试使用 PyPDF2 读取 PDF
|
# 尝试使用 PyPDF2 读取 PDF(新版可以直接传入路径)
|
||||||
try:
|
try:
|
||||||
pdf_document = PdfReader(pdf_path)
|
pdf_document = PdfReader(pdf_path)
|
||||||
total_pages = len(pdf_document.pages)
|
total_pages = len(pdf_document.pages)
|
||||||
get_text = lambda p: pdf_document.pages[p].extract_text()
|
# 定义提取文本函数
|
||||||
|
get_text = create_get_text_function('pypdf2', pdf_document)
|
||||||
using_pypdf2 = True
|
using_pypdf2 = True
|
||||||
# print("使用 PyPDF2 成功读取 PDF 文件。")
|
# print("使用 PyPDF2 成功读取 PDF 文件。")
|
||||||
except Exception as e_pypdf2:
|
except Exception as e_pypdf2:
|
||||||
print(f"save_extracted_pages: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
|
print(f"save_extracted_pages: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
|
||||||
# 尝试使用 PyMuPDF 读取 PDF
|
# 如果 PyPDF2 失败,尝试使用 PyMuPDF 读取 PDF
|
||||||
try:
|
try:
|
||||||
pdf_document = fitz.open(pdf_path)
|
pdf_document = fitz.open(pdf_path)
|
||||||
total_pages = pdf_document.page_count
|
total_pages = pdf_document.page_count
|
||||||
get_text = lambda p: pdf_document.load_page(p).get_text()
|
get_text = create_get_text_function('fitz', pdf_document)
|
||||||
using_pypdf2 = False
|
using_pypdf2 = False
|
||||||
# print("使用 PyMuPDF 成功读取 PDF 文件。")
|
# print("使用 PyMuPDF 成功读取 PDF 文件。")
|
||||||
except Exception as e_fitz:
|
except Exception as e_fitz:
|
||||||
print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
|
print(f"save_extracted_pages: 使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
# 检查页面范围的有效性
|
# 检查页面范围的有效性
|
||||||
@ -148,6 +169,7 @@ def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_s
|
|||||||
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
||||||
output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
|
output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
|
||||||
|
|
||||||
|
# 如果 output_suffix 为 'notice' 并且 start_page > 0,则另存“目录前”的页面
|
||||||
if output_suffix == 'notice' and start_page > 0:
|
if output_suffix == 'notice' and start_page > 0:
|
||||||
before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
|
before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
|
||||||
toc_page = -1
|
toc_page = -1
|
||||||
@ -158,6 +180,7 @@ def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_s
|
|||||||
except Exception as e_extract:
|
except Exception as e_extract:
|
||||||
print(f"提取第 {page_num} 页文本时出错: {e_extract}")
|
print(f"提取第 {page_num} 页文本时出错: {e_extract}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
cleaned_text = clean_page_content(text, common_header)
|
cleaned_text = clean_page_content(text, common_header)
|
||||||
if regex.search(r'目\s*录', cleaned_text, regex.MULTILINE):
|
if regex.search(r'目\s*录', cleaned_text, regex.MULTILINE):
|
||||||
toc_page = page_num
|
toc_page = page_num
|
||||||
@ -204,8 +227,8 @@ def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_s
|
|||||||
print(f"添加第 {page_num} 页到 output_doc 时出错: {e_page}")
|
print(f"添加第 {page_num} 页到 output_doc 时出错: {e_page}")
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
with open(output_pdf_path, 'wb') as f:
|
with open(output_pdf_path, 'wb') as f_out:
|
||||||
output_doc.write(f)
|
output_doc.write(f_out)
|
||||||
print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
|
print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
|
||||||
return output_pdf_path
|
return output_pdf_path
|
||||||
except Exception as e_write:
|
except Exception as e_write:
|
||||||
@ -213,24 +236,28 @@ def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_s
|
|||||||
return ""
|
return ""
|
||||||
else:
|
else:
|
||||||
output_doc = fitz.open()
|
output_doc = fitz.open()
|
||||||
|
try:
|
||||||
for page_num in range(start_page, end_page + 1):
|
for page_num in range(start_page, end_page + 1):
|
||||||
try:
|
try:
|
||||||
output_doc.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
|
output_doc.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
|
||||||
except Exception as e_page:
|
except Exception as e_page:
|
||||||
print(f"添加第 {page_num} 页到 output_doc 时出错: {e_page}")
|
print(f"添加第 {page_num} 页到 output_doc 时出错: {e_page}")
|
||||||
continue
|
continue
|
||||||
try:
|
|
||||||
output_doc.save(output_pdf_path)
|
output_doc.save(output_pdf_path)
|
||||||
output_doc.close()
|
|
||||||
print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
|
print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
|
||||||
return output_pdf_path
|
return output_pdf_path
|
||||||
except Exception as e_write:
|
except Exception as e_write:
|
||||||
print(f"保存截取的页面时出错: {e_write}")
|
print(f"保存截取的页面时出错: {e_write}")
|
||||||
return ""
|
return ""
|
||||||
|
finally:
|
||||||
|
output_doc.close()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"发生未知错误: {e}")
|
print(f"发生未知错误: {e}")
|
||||||
return ""
|
return ""
|
||||||
|
finally:
|
||||||
|
# 如果使用的是 PyMuPDF 或其他需要关闭的对象,确保关闭资源
|
||||||
|
if pdf_document and not using_pypdf2 and hasattr(pdf_document, "close"):
|
||||||
|
pdf_document.close()
|
||||||
|
|
||||||
def is_pdf_or_doc(filename):
|
def is_pdf_or_doc(filename):
|
||||||
# 判断文件是否为PDF或Word文档
|
# 判断文件是否为PDF或Word文档
|
||||||
@ -242,14 +269,14 @@ def convert_to_pdf(file_path):
|
|||||||
return docx2pdf(file_path)
|
return docx2pdf(file_path)
|
||||||
return file_path
|
return file_path
|
||||||
|
|
||||||
def get_invalid_file(file_path, output_folder, common_header, begin_page):
|
def get_invalid_file(pdf_path, output_folder, common_header, begin_page):
|
||||||
"""
|
"""
|
||||||
从指定的PDF文件中提取无效部分并保存到输出文件夹中。
|
从指定的PDF文件中提取无效部分并保存到输出文件夹中。
|
||||||
begin_pattern匹配开头 end_pattern匹配结尾
|
begin_pattern匹配开头 end_pattern匹配结尾
|
||||||
页数小于100不进行begin_pattern,默认start_page=0
|
页数小于100不进行begin_pattern,默认start_page=0
|
||||||
页数大于200时end_pattern从前往后匹配
|
页数大于200时end_pattern从前往后匹配
|
||||||
Args:
|
Args:
|
||||||
file_path (str): 输入的PDF文件路径。
|
pdf_path (str): 输入的PDF文件路径。
|
||||||
output_folder (str): 提取后文件的输出文件夹路径。
|
output_folder (str): 提取后文件的输出文件夹路径。
|
||||||
common_header (str): 公共头部文本,用于清理每页的内容。
|
common_header (str): 公共头部文本,用于清理每页的内容。
|
||||||
|
|
||||||
@ -284,23 +311,22 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
|
|||||||
exclusion_pattern = regex.compile(
|
exclusion_pattern = regex.compile(
|
||||||
r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
|
r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
|
||||||
|
|
||||||
|
pdf_document = None
|
||||||
try:
|
try:
|
||||||
# 打开PDF文件
|
# 尝试使用 PyPDF2 读取 PDF
|
||||||
try:
|
try:
|
||||||
pdf_document = PdfReader(file_path)
|
pdf_document = PdfReader(pdf_path)
|
||||||
pdf_lib = 'pypdf2'
|
get_text = create_get_text_function('pypdf2', pdf_document)
|
||||||
# 获取总页数(对于 pypdf2 的 PDFDocument)
|
|
||||||
total_pages = len(pdf_document.pages)
|
total_pages = len(pdf_document.pages)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"get_invalid_file:使用 pypdf2 读取失败,错误信息: {e},切换至 fitz。")
|
pdf_document = fitz.open(pdf_path)
|
||||||
pdf_document = fitz.open(file_path)
|
get_text = create_get_text_function('fitz', pdf_document)
|
||||||
pdf_lib = 'fitz'
|
|
||||||
# 获取总页数(对于 fitz 的 PDFDocument)
|
|
||||||
total_pages = pdf_document.page_count
|
total_pages = pdf_document.page_count
|
||||||
|
|
||||||
if total_pages <= 50:
|
if total_pages <= 50:
|
||||||
print(f"PDF页数({total_pages}) <= 50,直接返回文件路径:{file_path}")
|
print(f"PDF页数({total_pages}) <= 50,直接返回文件路径: {pdf_path}")
|
||||||
return file_path, 0
|
return pdf_path, 0
|
||||||
get_text = create_get_text_function(pdf_lib, pdf_document)
|
|
||||||
# 提取并清理每页的文本内容
|
# 提取并清理每页的文本内容
|
||||||
page_texts = []
|
page_texts = []
|
||||||
for i in range(total_pages):
|
for i in range(total_pages):
|
||||||
@ -308,11 +334,12 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
|
|||||||
cleaned_text = clean_page_content(text, common_header) if text else ""
|
cleaned_text = clean_page_content(text, common_header) if text else ""
|
||||||
page_texts.append(cleaned_text)
|
page_texts.append(cleaned_text)
|
||||||
|
|
||||||
# 定义查找起始页的函数
|
# 内部函数:查找起始页
|
||||||
def find_start_page(begin_page):
|
def find_start_page(begin_page):
|
||||||
for pattern in begin_patterns:
|
for pattern in begin_patterns: # 需提前定义好
|
||||||
for i in range(begin_page, min(begin_page + 30, total_pages)):
|
for i in range(begin_page, min(begin_page + 30, total_pages)):
|
||||||
text = page_texts[i]
|
text = page_texts[i]
|
||||||
|
# 若存在排除模式则跳过
|
||||||
if regex.search(exclusion_pattern, text):
|
if regex.search(exclusion_pattern, text):
|
||||||
continue
|
continue
|
||||||
if regex.search(pattern, text):
|
if regex.search(pattern, text):
|
||||||
@ -321,22 +348,25 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
|
|||||||
# print(f"未在前30页找到模式: {pattern.pattern}")
|
# print(f"未在前30页找到模式: {pattern.pattern}")
|
||||||
return 0 # 默认从第一页开始
|
return 0 # 默认从第一页开始
|
||||||
|
|
||||||
# 定义查找结束页的函数
|
# 内部函数:查找结束页
|
||||||
def find_end_page():
|
def find_end_page():
|
||||||
if total_pages > 200:
|
if total_pages > 200:
|
||||||
# print("总页数大于200页,结束页从前往后查找,跳过前30页。") #且先匹配合同
|
# 当页数较多时,从第31页开始正向查找结束模式
|
||||||
for pattern in end_patterns:
|
for pattern in end_patterns: # 需提前定义好
|
||||||
for i in range(30, total_pages):
|
for i in range(30, total_pages):
|
||||||
text = page_texts[i]
|
text = page_texts[i]
|
||||||
exclusion_pattern = regex.compile(
|
# 定义局部的排除模式
|
||||||
r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制|评审要素|评审标准)\s*$', regex.MULTILINE)
|
exclusion_pat = regex.compile(
|
||||||
if regex.search(exclusion_pattern,text):
|
r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制|评审要素|评审标准)\s*$',
|
||||||
|
regex.MULTILINE
|
||||||
|
)
|
||||||
|
if regex.search(exclusion_pat, text):
|
||||||
continue
|
continue
|
||||||
if regex.search(pattern, text):
|
if regex.search(pattern, text):
|
||||||
# print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}")
|
# print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}")
|
||||||
return i
|
return i
|
||||||
else:
|
else:
|
||||||
# print("结束页从后往前查找,确保只剩前30页时停止。")
|
# 当页数较少时,从后向前查找(至少保留前30页)
|
||||||
for pattern in end_patterns:
|
for pattern in end_patterns:
|
||||||
for i in range(total_pages - 1, 29, -1):
|
for i in range(total_pages - 1, 29, -1):
|
||||||
text = page_texts[i]
|
text = page_texts[i]
|
||||||
@ -344,77 +374,76 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
|
|||||||
# print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}")
|
# print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}")
|
||||||
return i
|
return i
|
||||||
|
|
||||||
# 如果没有匹配到,设置end_page逻辑
|
# 若未匹配到结束模式,设置默认结束页
|
||||||
if total_pages > 200:
|
if total_pages > 200:
|
||||||
print(f"未找到结束模式,总页数大于200,平滑调整结束页为 {end_page}。" + file_path)
|
print(f"未找到结束模式,总页数大于200,平滑调整结束页为 {max(100, int(total_pages * 1 / 2))}。{pdf_path}")
|
||||||
return max(100, int(total_pages * 1 / 2))
|
return max(100, int(total_pages * 1 / 2))
|
||||||
elif 100 < total_pages <= 200:
|
elif 100 < total_pages <= 200:
|
||||||
print("未找到结束模式,总页数在100到200之间,设置结束页为总页数的三分之二。" + file_path)
|
print("未找到结束模式,总页数在100到200之间,设置结束页为总页数的三分之二。 " + pdf_path)
|
||||||
return max(100, int(total_pages * 2 / 3))
|
return max(100, int(total_pages * 2 / 3))
|
||||||
else:
|
else:
|
||||||
print("未找到结束模式,设置结束页为最后一页。" + file_path)
|
print("未找到结束模式,设置结束页为最后一页。 " + pdf_path)
|
||||||
return total_pages - 1
|
return total_pages - 1
|
||||||
|
|
||||||
# 根据总页数决定是否查找起始页
|
# 根据总页数决定是否并发查找起始页与结束页
|
||||||
if total_pages < 100:
|
if total_pages < 100:
|
||||||
# print("总页数少于100页,默认起始页为第一页。")
|
start_page_found = 0
|
||||||
start_page = 0
|
|
||||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||||
future_end = executor.submit(find_end_page)
|
future_end = executor.submit(find_end_page)
|
||||||
end_page = future_end.result()
|
end_page_found = future_end.result()
|
||||||
else:
|
else:
|
||||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||||
future_start = executor.submit(find_start_page, begin_page)
|
future_start = executor.submit(find_start_page, begin_page)
|
||||||
future_end = executor.submit(find_end_page)
|
future_end = executor.submit(find_end_page)
|
||||||
start_page = future_start.result()
|
start_page_found = future_start.result()
|
||||||
end_page = future_end.result()
|
end_page_found = future_end.result()
|
||||||
# print(f"起始页: {start_page + 1}, 结束页: {end_page + 1}")
|
|
||||||
# 验证页码范围
|
# 检查页码范围是否有效
|
||||||
if start_page > end_page:
|
if start_page_found > end_page_found:
|
||||||
print(f"无效的页码范围: 起始页 ({start_page + 1}) > 结束页 ({end_page + 1})")
|
print(f"无效的页码范围: 起始页 ({start_page_found + 1}) > 结束页 ({end_page_found + 1})")
|
||||||
return "", 0
|
return "", 0
|
||||||
|
|
||||||
# 调用已实现的保存函数
|
# 调用已实现的保存函数,保存提取的页面
|
||||||
output_path = save_extracted_pages(
|
output_path = save_extracted_pages(
|
||||||
pdf_path=file_path,
|
pdf_path=pdf_path,
|
||||||
output_folder=output_folder,
|
output_folder=output_folder,
|
||||||
start_page=start_page,
|
start_page=start_page_found,
|
||||||
end_page=end_page,
|
end_page=end_page_found,
|
||||||
output_suffix="invalid",
|
output_suffix="invalid",
|
||||||
common_header=common_header
|
common_header=common_header
|
||||||
)
|
)
|
||||||
|
return output_path, end_page_found
|
||||||
# print(f"提取的页面 {start_page} 到 {end_page} 已保存至 {output_path}")
|
|
||||||
return output_path, end_page
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"处理文件 {file_path} 时发生错误: {e}")
|
print(f"处理文件 {pdf_path} 时发生错误: {e}")
|
||||||
return "", 0
|
return "", 0
|
||||||
|
|
||||||
|
finally:
|
||||||
|
# 确保释放资源(对 fitz 的 pdf_document 需要关闭)
|
||||||
|
if pdf_document and hasattr(pdf_document, "close"):
|
||||||
|
pdf_document.close()
|
||||||
|
|
||||||
|
|
||||||
def extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
|
def extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
|
||||||
output_suffix="normal", invalid_endpage=-1):
|
output_suffix="normal", invalid_endpage=-1):
|
||||||
start_page = None
|
start_page = None
|
||||||
end_page = None
|
end_page = None
|
||||||
flag = True
|
flag = True
|
||||||
|
# 先尝试使用 PyPDF2
|
||||||
try:
|
try:
|
||||||
# 尝试使用 PyPDF2 读取 PDF
|
|
||||||
pdf_document = PdfReader(pdf_path)
|
pdf_document = PdfReader(pdf_path)
|
||||||
total_pages = len(pdf_document.pages)
|
total_pages = len(pdf_document.pages)
|
||||||
get_text = create_get_text_function('pypdf2', pdf_document)
|
get_text = create_get_text_function('pypdf2', pdf_document)
|
||||||
# print("使用 PyPDF2 成功读取 PDF 文件。")
|
|
||||||
except Exception as e_pypdf2:
|
except Exception as e_pypdf2:
|
||||||
print(f"extract_pages_generic:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
|
print(f"extract_pages_generic:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
|
||||||
|
# 如果 PyPDF2 失败,就再试 PyMuPDF
|
||||||
try:
|
try:
|
||||||
# 如果 PyPDF2 失败,尝试使用 PyMuPDF 读取 PDF
|
|
||||||
pdf_document = fitz.open(pdf_path)
|
pdf_document = fitz.open(pdf_path)
|
||||||
total_pages = pdf_document.page_count
|
total_pages = pdf_document.page_count
|
||||||
get_text = create_get_text_function('fitz', pdf_document)
|
get_text = create_get_text_function('fitz', pdf_document)
|
||||||
# print("使用 PyMuPDF 成功读取 PDF 文件。")
|
|
||||||
except Exception as e_fitz:
|
except Exception as e_fitz:
|
||||||
print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
|
print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
|
||||||
return None, None # 或者根据需求抛出异常
|
return None, None
|
||||||
end_limit = total_pages if invalid_endpage == -1 else min(invalid_endpage, total_pages)
|
end_limit = total_pages if invalid_endpage == -1 else min(invalid_endpage, total_pages)
|
||||||
for i in range(begin_page, end_limit):
|
for i in range(begin_page, end_limit):
|
||||||
text = get_text(i)
|
text = get_text(i)
|
||||||
@ -447,7 +476,9 @@ def extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, comm
|
|||||||
if regex.search(end_pattern, cleaned_text) and not regex.search(begin_pattern, cleaned_text):
|
if regex.search(end_pattern, cleaned_text) and not regex.search(begin_pattern, cleaned_text):
|
||||||
end_page = i
|
end_page = i
|
||||||
break
|
break
|
||||||
|
# 如果使用的是 PyMuPDF,需要手动关闭文档
|
||||||
|
if pdf_document and isinstance(pdf_document, fitz.Document):
|
||||||
|
pdf_document.close()
|
||||||
return start_page, end_page
|
return start_page, end_page
|
||||||
|
|
||||||
|
|
||||||
@ -534,7 +565,7 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
|
|||||||
"""
|
"""
|
||||||
exclusion_pattern = regex.compile(
|
exclusion_pattern = regex.compile(
|
||||||
r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
|
r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
|
||||||
def run_extraction(pdf_document,begin_pattern, extraction_stage='first'):
|
def run_extraction(begin_pattern, extraction_stage='first'):
|
||||||
"""
|
"""
|
||||||
使用提供的 begin_pattern 运行提取过程。
|
使用提供的 begin_pattern 运行提取过程。
|
||||||
根据 extraction_stage 判断是“第一次提取”还是“第三次提取”,
|
根据 extraction_stage 判断是“第一次提取”还是“第三次提取”,
|
||||||
@ -554,14 +585,11 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
|
|||||||
end_page = None
|
end_page = None
|
||||||
combined_mid_pattern = None # 中间页的组合模式
|
combined_mid_pattern = None # 中间页的组合模式
|
||||||
catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
|
catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
|
||||||
total_pages = len(pdf_document.pages) if isinstance(pdf_document, PdfReader) else pdf_document.page_count
|
|
||||||
end_limit = total_pages if invalid_endpage == -1 else min(invalid_endpage, total_pages)
|
end_limit = total_pages if invalid_endpage == -1 else min(invalid_endpage, total_pages)
|
||||||
# ===== 遍历每一页,执行提取逻辑 =====
|
# ===== 遍历每一页,执行提取逻辑 =====
|
||||||
for i in range(end_limit):
|
for i in range(end_limit):
|
||||||
try:
|
try:
|
||||||
text = pdf_document.pages[i].extract_text() if isinstance(pdf_document,
|
text = get_text(i)
|
||||||
PdfReader) else pdf_document.load_page(i).get_text()
|
|
||||||
text = text or ""
|
|
||||||
cleaned_text = clean_page_content(text, common_header)
|
cleaned_text = clean_page_content(text, common_header)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"提取第 {i} 页文本时出错: {e}")
|
print(f"提取第 {i} 页文本时出错: {e}")
|
||||||
@ -621,13 +649,17 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
|
|||||||
path2 = path1
|
path2 = path1
|
||||||
return path1, path2
|
return path1, path2
|
||||||
|
|
||||||
|
pdf_document = None
|
||||||
|
try:
|
||||||
try:
|
try:
|
||||||
pdf_document = PdfReader(pdf_path)
|
pdf_document = PdfReader(pdf_path)
|
||||||
|
get_text = create_get_text_function('pypdf2', pdf_document)
|
||||||
total_pages = len(pdf_document.pages)
|
total_pages = len(pdf_document.pages)
|
||||||
except Exception as e_pypdf2:
|
except Exception as e_pypdf2:
|
||||||
print(f"extract_pages_tobidders_notice:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
|
print(f"extract_pages_tobidders_notice:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
|
||||||
try:
|
try:
|
||||||
pdf_document = fitz.open(pdf_path)
|
pdf_document = fitz.open(pdf_path)
|
||||||
|
get_text = create_get_text_function('fitz', pdf_document)
|
||||||
total_pages = pdf_document.page_count
|
total_pages = pdf_document.page_count
|
||||||
# print("使用 PyMuPDF 成功读取 PDF 文件。")
|
# print("使用 PyMuPDF 成功读取 PDF 文件。")
|
||||||
except Exception as e_fitz:
|
except Exception as e_fitz:
|
||||||
@ -641,7 +673,7 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
|
|||||||
r'(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+\s*$',
|
r'(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+\s*$',
|
||||||
regex.MULTILINE
|
regex.MULTILINE
|
||||||
)
|
)
|
||||||
start_page, mid_page, end_page = run_extraction(pdf_document,begin_pattern, extraction_stage='first')
|
start_page, mid_page, end_page = run_extraction(begin_pattern, extraction_stage='first')
|
||||||
# 如果第一次提取成功,保存并返回路径
|
# 如果第一次提取成功,保存并返回路径
|
||||||
if start_page is not None and mid_page is not None and end_page is not None:
|
if start_page is not None and mid_page is not None and end_page is not None:
|
||||||
return perform_extraction_and_save(start_page, mid_page, end_page)
|
return perform_extraction_and_save(start_page, mid_page, end_page)
|
||||||
@ -664,17 +696,30 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
|
|||||||
r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表\s*$',
|
r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表\s*$',
|
||||||
regex.MULTILINE
|
regex.MULTILINE
|
||||||
)
|
)
|
||||||
start_page, mid_page, end_page = run_extraction(pdf_document, new_begin_pattern, extraction_stage='third')
|
start_page, mid_page, end_page = run_extraction(new_begin_pattern, extraction_stage='third')
|
||||||
if start_page is not None and mid_page is not None and end_page is not None:
|
if start_page is not None and mid_page is not None and end_page is not None:
|
||||||
return perform_extraction_and_save(start_page, mid_page, end_page)
|
return perform_extraction_and_save(start_page, mid_page, end_page)
|
||||||
else:
|
else:
|
||||||
# 所有提取尝试均失败
|
# 所有提取尝试均失败
|
||||||
print(f"三次提取tobidders_notice均失败!! {pdf_path}")
|
print(f"三次提取tobidders_notice均失败!! {pdf_path}")
|
||||||
return "", ""
|
return "", ""
|
||||||
|
except Exception as e:
|
||||||
|
print(f"处理文件 {pdf_path} 时发生错误: {e}")
|
||||||
|
return "", ""
|
||||||
|
finally:
|
||||||
|
# 无论如何都关闭 pdf_document(对于 fitz 对象)
|
||||||
|
if pdf_document and hasattr(pdf_document, "close"):
|
||||||
|
pdf_document.close()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def extract_pages_twice_tobidders_notice(pdf_path, common_header, begin_page):
|
def extract_pages_twice_tobidders_notice(pdf_path, common_header, begin_page):
|
||||||
|
# 使用 with open 确保文件流在读取后关闭,避免文件句柄泄露
|
||||||
|
try:
|
||||||
pdf_document = PdfReader(pdf_path)
|
pdf_document = PdfReader(pdf_path)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"读取 PDF 文件失败: {e}")
|
||||||
|
return None, None, None, None
|
||||||
# 投标须知前附表占一章、投标须知正文占一章
|
# 投标须知前附表占一章、投标须知正文占一章
|
||||||
output_suffix = "tobidders_notice"
|
output_suffix = "tobidders_notice"
|
||||||
begin_pattern = regex.compile(
|
begin_pattern = regex.compile(
|
||||||
@ -696,16 +741,13 @@ def extract_pages_twice_tobidders_notice(pdf_path, common_header, begin_page):
|
|||||||
if start_page1 is None or end_page1 is None:
|
if start_page1 is None or end_page1 is None:
|
||||||
return None, None, None,None
|
return None, None, None,None
|
||||||
|
|
||||||
# 提取第二部分
|
# 提取第二部分 先检查end_page1页面的内容
|
||||||
|
text = get_text_pypdf2(pdf_document,end_page1)
|
||||||
# 检查end_page1页面的内容
|
|
||||||
text = pdf_document.pages[end_page1].extract_text() or ""
|
|
||||||
cleaned_text = clean_page_content(text, common_header)
|
cleaned_text = clean_page_content(text, common_header)
|
||||||
match = end_pattern.search(cleaned_text)
|
match = end_pattern.search(cleaned_text)
|
||||||
|
|
||||||
if match:
|
if match:
|
||||||
# 获取匹配到的中文部分
|
# 获取匹配到的中文部分
|
||||||
chapter_title = match.group(1)
|
chapter_title = match.group(1) if match.groups() else ""
|
||||||
# 检查是否包含排除关键词
|
# 检查是否包含排除关键词
|
||||||
if any(word in chapter_title for word in exclusion_words):
|
if any(word in chapter_title for word in exclusion_words):
|
||||||
# 如果包含排除关键词,直接返回相同的路径
|
# 如果包含排除关键词,直接返回相同的路径
|
||||||
@ -722,7 +764,6 @@ def extract_pages_twice_tobidders_notice(pdf_path, common_header, begin_page):
|
|||||||
|
|
||||||
return start_page1, end_page1, start_page2,end_page2
|
return start_page1, end_page1, start_page2,end_page2
|
||||||
|
|
||||||
#return start_page1, end_page1, end_page1
|
|
||||||
def get_notice(pdf_path, output_folder, begin_page, common_header,invalid_endpage):
|
def get_notice(pdf_path, output_folder, begin_page, common_header,invalid_endpage):
|
||||||
begin_pattern = regex.compile(
|
begin_pattern = regex.compile(
|
||||||
r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\))]?\s*$',
|
r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\))]?\s*$',
|
||||||
|
@ -563,12 +563,8 @@ def handle_query(file_path, user_query, output_file, result_key, keywords):
|
|||||||
else:
|
else:
|
||||||
user_query = generate_full_user_query(output_file, user_query)
|
user_query = generate_full_user_query(output_file, user_query)
|
||||||
model_ans = qianwen_plus(user_query) # 豆包模型返回结果
|
model_ans = qianwen_plus(user_query) # 豆包模型返回结果
|
||||||
# file_id = upload_file(output_file)
|
num_list = process_string_list(model_ans) # 处理模型返回的序号列表
|
||||||
# model_ans = qianwen_long(file_id, user_query)
|
|
||||||
num_list = process_string_list(model_ans) # 处理模型返回的序号
|
|
||||||
print(result_key + "选中的序号:" + str(num_list))
|
print(result_key + "选中的序号:" + str(num_list))
|
||||||
# print(all_texts1_list)
|
|
||||||
# print(all_text1_items)
|
|
||||||
for index in num_list:
|
for index in num_list:
|
||||||
if 1 <= index <= len(all_texts1_list):
|
if 1 <= index <= len(all_texts1_list):
|
||||||
original_global_idx = all_text1_items[index - 1][0]
|
original_global_idx = all_text1_items[index - 1][0]
|
||||||
|
@ -3,6 +3,8 @@ import threading
|
|||||||
import fitz
|
import fitz
|
||||||
from PyPDF2 import PdfReader
|
from PyPDF2 import PdfReader
|
||||||
from docx import Document
|
from docx import Document
|
||||||
|
import queue
|
||||||
|
from queue import Queue, Empty
|
||||||
def extract_common_header(pdf_path):
|
def extract_common_header(pdf_path):
|
||||||
"""
|
"""
|
||||||
提取 PDF 文件的公共页眉。
|
提取 PDF 文件的公共页眉。
|
||||||
@ -181,36 +183,60 @@ def clean_page_content(text, common_header):
|
|||||||
text = re.sub(r'^\s*[—-]\s*(第\s*)?\d{1,3}\s*(页)?\s*[—-]\s*', '', text) # 删除形如 '—2—', '-2-', 或 '-第2页-' 的页码
|
text = re.sub(r'^\s*[—-]\s*(第\s*)?\d{1,3}\s*(页)?\s*[—-]\s*', '', text) # 删除形如 '—2—', '-2-', 或 '-第2页-' 的页码
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
def read_page_text_with_timeout(page, timeout=3):
    """
    Extract the text of a single PDF page, giving up after ``timeout`` seconds.

    ``page.extract_text()`` runs in a daemon worker thread and its result is
    handed back through a one-slot queue.

    Args:
        page: Any object exposing ``extract_text()``.
        timeout: Maximum number of seconds to wait for the worker. ``None``
            waits indefinitely; negative values are treated as 0.

    Returns:
        - The extracted text (``str``) when extraction finishes in time.
        - ``""`` when extraction returns nothing or raises an exception.
        - ``None`` when the timeout expires first.
    """
    result_queue = Queue(maxsize=1)

    def extract_worker():
        try:
            txt = page.extract_text() or ""
        except Exception as e:
            print(f"文本提取异常: {str(e)}")
            txt = ""
        # Exactly one item is ever put, so this never blocks on the 1-slot queue.
        result_queue.put(txt)

    worker = threading.Thread(
        target=extract_worker,
        daemon=True,
        name=f"PDFTextExtract-{id(page)}"
    )
    worker.start()

    # Queue.get() rejects negative timeouts, so clamp them to "do not wait".
    wait_seconds = None if timeout is None else max(timeout, 0)
    try:
        return result_queue.get(timeout=wait_seconds)
    except Empty:
        # Python has no supported way to kill a running thread (the private
        # Thread._stop() does not do that), so the worker is simply abandoned.
        # Being a daemon thread, it will not block interpreter shutdown.
        print(f"文本提取超时({timeout}s),终止操作")
        return None
|
||||||
|
|
||||||
|
|
||||||
def is_scanned_pdf(file_path, max_pages=15, page_timeout=3, overall_timeout=30):
|
def is_scanned_pdf(file_path, max_pages=15, page_timeout=3, overall_timeout=30):
|
||||||
"""
|
"""
|
||||||
@ -220,58 +246,43 @@ def is_scanned_pdf(file_path, max_pages=15, page_timeout=3, overall_timeout=30):
|
|||||||
3. 如果前 max_pages 页都检测完成,均无可见文本,则返回 True(认为是扫描件)。
|
3. 如果前 max_pages 页都检测完成,均无可见文本,则返回 True(认为是扫描件)。
|
||||||
4. 如果需要整体超时(overall_timeout),则在最外层加一个封装线程进行控制。
|
4. 如果需要整体超时(overall_timeout),则在最外层加一个封装线程进行控制。
|
||||||
"""
|
"""
|
||||||
def core_check(result_container, done_event):
|
result_queue = queue.Queue()
|
||||||
"""
|
done_event = threading.Event()
|
||||||
真正的核心逻辑,执行完后把结果塞进 result_container[0],
|
def core_check():
|
||||||
然后调用 done_event.set() 告知“整个检查流程结束”。
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
with open(file_path, 'rb') as f:
|
with open(file_path, 'rb') as f:
|
||||||
reader = PdfReader(f)
|
reader = PdfReader(f)
|
||||||
for i, page in enumerate(reader.pages):
|
for i, page in enumerate(reader.pages):
|
||||||
if i >= max_pages:
|
if i >= max_pages or done_event.is_set():
|
||||||
break
|
break
|
||||||
|
|
||||||
# 尝试在 page_timeout 内获取文本
|
|
||||||
text = read_page_text_with_timeout(page, page_timeout)
|
text = read_page_text_with_timeout(page, page_timeout)
|
||||||
if text is None:
|
if text is None:
|
||||||
print(f"第 {i+1} 页文本提取超时,直接判定为扫描件。")
|
print(f"Page {i + 1} timeout")
|
||||||
result_container[0] = True
|
result_queue.put(True)
|
||||||
done_event.set()
|
|
||||||
return
|
return
|
||||||
|
|
||||||
if text.strip():
|
if text.strip():
|
||||||
print(f"第 {i+1} 页检测到文本,判定为非扫描件。")
|
print(f"Text found on page {i + 1}")
|
||||||
result_container[0] = False
|
result_queue.put(False)
|
||||||
done_event.set()
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
result_queue.put(True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("处理 PDF 文件时发生异常:", e)
|
print(f"Processing error: {str(e)}")
|
||||||
result_container[0] = True
|
result_queue.put(True)
|
||||||
done_event.set()
|
finally:
|
||||||
return
|
|
||||||
|
|
||||||
print(f"前 {max_pages} 页均未检测到文本,判定为扫描件。")
|
|
||||||
result_container[0] = True
|
|
||||||
done_event.set()
|
done_event.set()
|
||||||
|
|
||||||
result_container = [None] # 用于在子线程中传递结果
|
thread = threading.Thread(target=core_check, daemon=True)
|
||||||
done = threading.Event()
|
|
||||||
thread = threading.Thread(target=core_check, args=(result_container, done), daemon=True)
|
|
||||||
thread.start()
|
thread.start()
|
||||||
|
try:
|
||||||
|
result = result_queue.get(timeout=overall_timeout)
|
||||||
|
except queue.Empty:
|
||||||
|
print(f"Overall timeout after {overall_timeout}s")
|
||||||
|
return True # 或定义特殊超时状态码
|
||||||
|
|
||||||
# 等待整体流程结束,最多等待 overall_timeout 秒
|
return result
|
||||||
finished_in_time = done.wait(overall_timeout)
|
|
||||||
if not finished_in_time:
|
|
||||||
print(f"整体检查超时(超过 {overall_timeout} 秒),返回默认结果。")
|
|
||||||
return True # 或者根据需要返回其它默认值
|
|
||||||
|
|
||||||
# 打印最终结果调试信息
|
|
||||||
if result_container[0]:
|
|
||||||
print("最终结果:认为是扫描件(未检测到有效文本或发生单页超时)")
|
|
||||||
else:
|
|
||||||
print("最终结果:认为非扫描件(检测到有效文本)")
|
|
||||||
return result_container[0]
|
|
||||||
|
|
||||||
def is_pure_image(docx_path, percentage=0.3):
|
def is_pure_image(docx_path, percentage=0.3):
|
||||||
"""
|
"""
|
||||||
|
@ -12,7 +12,9 @@ def judge_zbfile_exec(file_path):
|
|||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
# 检查文件是否为PDF格式
|
# 检查文件是否为PDF格式
|
||||||
if file_path.lower().endswith('.pdf'):
|
if file_path.lower().endswith('.pdf'):
|
||||||
reader = PdfReader(file_path)
|
# 使用 with 语句确保文件关闭
|
||||||
|
with open(file_path, 'rb') as f:
|
||||||
|
reader = PdfReader(f)
|
||||||
num_pages = len(reader.pages)
|
num_pages = len(reader.pages)
|
||||||
if num_pages <= 5:
|
if num_pages <= 5:
|
||||||
return False
|
return False
|
||||||
@ -32,11 +34,7 @@ def judge_zbfile_exec(file_path):
|
|||||||
print(f"judge_zbfile_exec实际耗时:{end_time - start_time:.2f} 秒")
|
print(f"judge_zbfile_exec实际耗时:{end_time - start_time:.2f} 秒")
|
||||||
print(f"判断是否属于招标文件:{model_res}")
|
print(f"判断是否属于招标文件:{model_res}")
|
||||||
|
|
||||||
# 根据模型返回结果判断
|
return '否' not in model_res
|
||||||
if '否' in model_res:
|
|
||||||
return False
|
|
||||||
else:
|
|
||||||
return True
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"处理文件时出错: {e}")
|
print(f"处理文件时出错: {e}")
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
# flask_app/start_up.py
|
# flask_app/start_up.py
|
||||||
|
import os
|
||||||
|
|
||||||
from flask import Flask, g
|
from flask import Flask, g
|
||||||
from flask_app.ConnectionLimiter import ConnectionLimiter
|
from flask_app.ConnectionLimiter import ConnectionLimiter
|
||||||
@ -11,16 +12,10 @@ from flask_app.general.llm.清除file_id import delete_file_by_ids,read_file_ids
|
|||||||
from flask_app.routes.judge_zbfile import judge_zbfile_bp
|
from flask_app.routes.judge_zbfile import judge_zbfile_bp
|
||||||
|
|
||||||
|
|
||||||
class FlaskAppWithLimiter(Flask):
|
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
super().__init__(*args, **kwargs)
|
|
||||||
# 初始化一个字典来存储每个蓝图的限流器
|
|
||||||
self.connection_limiters = {}
|
|
||||||
|
|
||||||
|
|
||||||
def create_app():
|
def create_app():
|
||||||
app = FlaskAppWithLimiter(__name__)
|
|
||||||
# 创建全局日志记录器
|
# 创建全局日志记录器
|
||||||
|
app = Flask(__name__)
|
||||||
app.global_logger = create_logger_main('global_log') # 全局日志记录器
|
app.global_logger = create_logger_main('global_log') # 全局日志记录器
|
||||||
# 注册蓝图
|
# 注册蓝图
|
||||||
app.register_blueprint(get_deviation_bp)
|
app.register_blueprint(get_deviation_bp)
|
||||||
@ -28,11 +23,14 @@ def create_app():
|
|||||||
app.register_blueprint(upload_bp)
|
app.register_blueprint(upload_bp)
|
||||||
app.register_blueprint(test_zbparse_bp)
|
app.register_blueprint(test_zbparse_bp)
|
||||||
app.register_blueprint(judge_zbfile_bp)
|
app.register_blueprint(judge_zbfile_bp)
|
||||||
# app.connection_limiters['upload'] = ConnectionLimiter(max_connections=100)
|
|
||||||
# app.connection_limiters['get_deviation'] = ConnectionLimiter(max_connections=100)
|
|
||||||
# app.connection_limiters['default'] = ConnectionLimiter(max_connections=100)
|
|
||||||
# app.connection_limiters['judge_zbfile'] = ConnectionLimiter(max_connections=100)
|
|
||||||
|
|
||||||
|
@app.teardown_request
|
||||||
|
def check_fd(exception=None):
|
||||||
|
logger = g.logger
|
||||||
|
end_fd = os.listdir('/proc/self/fd')
|
||||||
|
leaked = set(end_fd) - set(g.start_fd)
|
||||||
|
if leaked:
|
||||||
|
logger.warning(f"潜在FD泄漏: {len(leaked)}个未关闭")
|
||||||
# @app.teardown_request
|
# @app.teardown_request
|
||||||
# def teardown_request(exception):
|
# def teardown_request(exception):
|
||||||
# # 接口请求之后都会执行该代码,做一些清理工作
|
# # 接口请求之后都会执行该代码,做一些清理工作
|
||||||
|
@ -29,6 +29,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
|
|||||||
处理 PDF 文件,作为 truncate_pdf_main 的后备方法。
|
处理 PDF 文件,作为 truncate_pdf_main 的后备方法。
|
||||||
此函数将在所有模式对都失败后调用。
|
此函数将在所有模式对都失败后调用。
|
||||||
"""
|
"""
|
||||||
|
pdf_document=None
|
||||||
try:
|
try:
|
||||||
try:
|
try:
|
||||||
pdf_document = PdfReader(pdf_path)
|
pdf_document = PdfReader(pdf_path)
|
||||||
@ -109,6 +110,9 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error in extract_pages_twice: {e}")
|
print(f"Error in extract_pages_twice: {e}")
|
||||||
return ""
|
return ""
|
||||||
|
finally:
|
||||||
|
if pdf_document and hasattr(pdf_document, 'close'):
|
||||||
|
pdf_document.close()
|
||||||
|
|
||||||
|
|
||||||
def truncate_pdf_main_engineering(input_path, output_folder, selection, logger, output_suffix="default",invalid_endpage=-1):
|
def truncate_pdf_main_engineering(input_path, output_folder, selection, logger, output_suffix="default",invalid_endpage=-1):
|
||||||
|
@ -55,7 +55,6 @@ def get_patterns_for_evaluation_method():
|
|||||||
return begin_pattern, end_pattern
|
return begin_pattern, end_pattern
|
||||||
|
|
||||||
def extract_pages_qualification(pdf_path, begin_page, common_header):
|
def extract_pages_qualification(pdf_path, begin_page, common_header):
|
||||||
pdf_document = PdfReader(pdf_path)
|
|
||||||
# 开始匹配模式,仅匹配“附录”、“附件”或“附表”
|
# 开始匹配模式,仅匹配“附录”、“附件”或“附表”
|
||||||
begin_pattern = regex.compile(
|
begin_pattern = regex.compile(
|
||||||
r'^(?:附录(?:一|1)?[::]?|附件(?:一|1)?[::]?|附表(?:一|1)?[::]?)(?!的)',
|
r'^(?:附录(?:一|1)?[::]?|附件(?:一|1)?[::]?|附表(?:一|1)?[::]?)(?!的)',
|
||||||
@ -84,7 +83,8 @@ def extract_pages_qualification(pdf_path, begin_page, common_header):
|
|||||||
end_page = None
|
end_page = None
|
||||||
include_keywords = ["资格审查", "资质审查", "符合性审查","资格性审查", "符合性检查","资格性检查", "资格检查"]
|
include_keywords = ["资格审查", "资质审查", "符合性审查","资格性审查", "符合性检查","资格性检查", "资格检查"]
|
||||||
exclude_keywords = ["声明函", "承诺函"]
|
exclude_keywords = ["声明函", "承诺函"]
|
||||||
|
pdf_document = None
|
||||||
|
try:
|
||||||
try:
|
try:
|
||||||
# 尝试使用 PyPDF2 读取 PDF
|
# 尝试使用 PyPDF2 读取 PDF
|
||||||
pdf_document = PdfReader(pdf_path)
|
pdf_document = PdfReader(pdf_path)
|
||||||
@ -143,6 +143,9 @@ def extract_pages_qualification(pdf_path, begin_page, common_header):
|
|||||||
print(f"qualification:找到结束页 (章节标题): {end_page}")
|
print(f"qualification:找到结束页 (章节标题): {end_page}")
|
||||||
break # 找到结束页后退出循环
|
break # 找到结束页后退出循环
|
||||||
return start_page, end_page
|
return start_page, end_page
|
||||||
|
finally:
|
||||||
|
if pdf_document and hasattr(pdf_document, 'close'):
|
||||||
|
pdf_document.close()
|
||||||
|
|
||||||
|
|
||||||
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page, logger):
|
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page, logger):
|
||||||
@ -322,8 +325,8 @@ if __name__ == "__main__":
|
|||||||
pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\2c3e2291-6804-4ef0-b4a8-6f457edd5709\ztbfile.pdf"
|
pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\2c3e2291-6804-4ef0-b4a8-6f457edd5709\ztbfile.pdf"
|
||||||
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
|
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
|
||||||
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
|
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
|
||||||
output_folder = r"C:\Users\Administrator\Desktop\货物标\output5"
|
output_folder = r"C:\Users\Administrator\Desktop\货物标\output4"
|
||||||
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
|
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
|
||||||
selection = 5 # 例如:1 - 公告 notice , 2 - 评标办法 evaluation_method, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 procurement 6-invalid
|
selection = 4 # 例如:1 - 公告 notice , 2 - 评标办法 evaluation_method, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 procurement 6-invalid
|
||||||
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
|
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
|
||||||
print(generated_files)
|
print(generated_files)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user