2.14 尝试解决内存泄漏问题

This commit is contained in:
zy123 2025-02-14 15:46:07 +08:00
parent cef8ff5415
commit 979d7fac21
11 changed files with 387 additions and 328 deletions

View File

@ -9,12 +9,10 @@ from flask_app.general.读取文件.clean_pdf import is_scanned_pdf, is_pure_ima
def download_file(url, local_filename,enable=False):
"""
下载文件并保存到本地,基于 Content-Type 设置文件扩展名。
参数:
- url (str): 文件的URL地址
- local_filename (str): 本地保存的文件名不含扩展名
- enable: 是否需要判断为扫描型/纯图片文件
返回:
- tuple: (完整文件名, 文件类型代码)
文件类型代码:
@ -31,10 +29,10 @@ def download_file(url, local_filename,enable=False):
# 获取Content-Type并猜测文件扩展名
content_type = response.headers.get('Content-Type', '')
extension = mimetypes.guess_extension(content_type, strict=False) or '.docx'
# 分离文件名和现有扩展
base, ext = os.path.splitext(local_filename)
if ext.lower() != extension:
extension_lower = extension.lower() #显式地将 extension 转换为小写(存入 extension_lower更加健壮
# 根据传入的 local_filename 构造完整的文件
base, current_ext = os.path.splitext(local_filename)
if current_ext.lower() != extension_lower:
full_filename = base + extension
else:
full_filename = local_filename
@ -94,6 +92,7 @@ def local_file_2_url(file_path, url):
print("format_change 文件上传成功")
receive_file_response = response.content.decode('utf-8')
receive_file_json = json.loads(receive_file_response)
print(receive_file_json)
receive_file_url = receive_file_json["data"]
else:
@ -207,17 +206,17 @@ def get_pdf_page_count(file_path):
if __name__ == '__main__':
# 替换为你的文件路径和API URL
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\1fbbb6ff-7ddc-40bb-8857-b7de37aece3f\\兴欣工程.pdf"
# local_path_in = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.docx"
local_path_in = r"C:\Users\Administrator\Downloads\基础公司枞阳经开区新能源汽车零部件产业园基础设施建设项目二期桩基工程劳务分包.doc"
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf"
# local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf"
# local_path_in=r"C:\Users\Administrator\Desktop\fsdownload\457ee03d-c61c-4672-b959-2bbb35a1de29\ztbfile_invalid.pdf"
local_path_in = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\ztbfile(1).pdf"
intermediate_docx = pdf2docx(local_path_in)
if intermediate_docx:
normal_pdf = docx2pdf(intermediate_docx, force=True)
# local_path_in = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\ztbfile(1).pdf"
# intermediate_docx = pdf2docx(local_path_in)
# if intermediate_docx:
# normal_pdf = docx2pdf(intermediate_docx, force=True)
# # downloaded_file=pdf2docx(local_path_in)
# downloaded_file=docx2pdf(local_path_in)
# print(downloaded_file)
downloaded_file=docx2pdf(local_path_in)
print(downloaded_file)
# test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/%E6%B5%8B%E8%AF%95%E4%BF%A1%E5%8F%B7%E6%B5%8B%E8%AF%95%E4%BF%A1%E5%8F%B7.pdf?Expires=1736852995&OSSAccessKeyId=TMP.3Kg1oKKcsSWb7DXNe4F56bfGfKY5nNWUi274p39HyY7GR3mghMCaFWy69Fi83SBab6PmSkErh4JUD4yAxAGzVVx2hxxoxm&Signature=rmxS5lett4MzWdksDI57EujCklw%3"
# local_file_name = r'D:\flask_project\flask_app\static\output\output1\1d763771-f25a-4b65-839e-3b2ca56577b1\tmp\ztbfile.pdf'

View File

@ -23,37 +23,33 @@ def insert_mark(input_pdf_path):
# 遍历每一页
for page_num in range(total_pages):
# 添加原始页面
page = pdf_reader.pages[page_num]
pdf_writer.add_page(page)
# 创建一个内存中的PDF用于存放带有文本的空白页
packet = BytesIO()
# 获取当前页面的宽度和高度
page_width = float(page.mediabox.width)
page_height = float(page.mediabox.height)
# 使用reportlab创建一个新的PDF页面
c = canvas.Canvas(packet, pagesize=(page_width, page_height))
# 使用 BytesIO 的上下文管理器创建内存中的 PDF 页面用于标记
with BytesIO() as packet:
# 获取当前页面的宽度和高度
page_width = float(page.mediabox.width)
page_height = float(page.mediabox.height)
# 使用 reportlab 创建一个新的 PDF 页面,页面大小与当前页面相同
c = canvas.Canvas(packet, pagesize=(page_width, page_height))
# 计算文本的位置(单位:1 厘米 ≈ 28.35 点)
x_position = 2.3 * cm
y_position = page_height - 0.5 * cm # 从顶部开始计算,因此用页面高度减去上边距
# 计算文本位置:距左边距 2.3cm,距顶部 0.5cm
x_position = 2.3 * cm
y_position = page_height - 0.5 * cm
c.setFont("Helvetica", 12)
c.drawString(x_position, y_position, f"[$$index_mark_{page_num + 1}$$]")
# 绘制文本,使用 (page_num + 1) 作为索引
c.setFont("Helvetica", 12) # 设置字体和大小
c.drawString(x_position, y_position, f"[$$index_mark_{page_num + 1}$$]")
# 完成绘制,将内容写入 BytesIO 流
c.save()
# 完成绘制
c.save()
# 将内存中的PDF读入PyPDF2
packet.seek(0)
new_pdf = PdfReader(packet)
# blank_page = new_pdf.pages[0]
blank_page = copy.deepcopy(new_pdf.pages[0])
packet.truncate(0)
packet.seek(0)
# 将带有文本的空白页添加到写入器
pdf_writer.add_page(blank_page)
# 重置流位置,读取刚刚生成的 PDF 页面
packet.seek(0)
new_pdf = PdfReader(packet)
# 使用 deepcopy 确保页面对象独立,避免后续修改影响已添加页面
blank_page = copy.deepcopy(new_pdf.pages[0])
pdf_writer.add_page(blank_page)
# 将所有页面写入输出的PDF文件
with open(output_pdf_path, 'wb') as output_file:
@ -65,10 +61,14 @@ def insert_mark(input_pdf_path):
print(f"发生错误: {e}")
return input_pdf_path
def delete_mark(docx_path):
"""
删除文档中包含标记文本 "[$$index_mark_" 的段落
如果该段落后紧跟一个包含 "w:sectPr" 的空白分节符段落也一并删除
修改后的文档保存为 invalid_del.docx并返回新文件路径
"""
try:
docx = Document(docx_path)
doc = Document(docx_path)
except KeyError as e:
print(f"Error opening document: {e}")
return ""
@ -76,30 +76,41 @@ def delete_mark(docx_path):
print(f"Invalid package: {e}")
return ""
# 继续处理文档
find_flag = False
for para in docx.paragraphs:
# 匹配标记: [$$index_mark_X$$]
# 收集所有需要删除的段落,避免在遍历时直接修改列表
paragraphs_to_remove = []
paragraphs = doc.paragraphs
i = 0
while i < len(paragraphs):
para = paragraphs[i]
if "[$$index_mark_" in para.text:
para._element.getparent().remove(para._element) # 删标记
find_flag = True
if find_flag and "w:sectPr" in para._element.xml: # 删空白分节符
para._element.getparent().remove(para._element)
find_flag = False
dir_path = os.path.dirname(docx_path)
new_file_path = os.path.join(dir_path, 'invalid_del.docx')
paragraphs_to_remove.append(para)
# 如果当前段落后面有段落且该段落的 XML 中包含 "w:sectPr",也删除该段落
if i + 1 < len(paragraphs):
next_para = paragraphs[i + 1]
if "w:sectPr" in next_para._element.xml:
paragraphs_to_remove.append(next_para)
i += 1 # 跳过下一个段落,避免重复处理
i += 1
# 保存修改后的文档
docx.save(new_file_path)
# 从文档 XML 中移除收集到的段落
for para in paragraphs_to_remove:
parent = para._element.getparent()
if parent is not None:
parent.remove(para._element)
new_file_path = os.path.join(os.path.dirname(docx_path), 'invalid_del.docx')
doc.save(new_file_path)
return new_file_path
if __name__ == '__main__':
# input=r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\ztbfile.pdf'
input=r'C:\Users\Administrator\Desktop\fsdownload\45645668-be5e-4124-b06a-6abaa9d87d86\ztbfile_invalid.pdf'
output=insert_mark(input)
# doc_path = r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\invalid_added.docx'
# res=delete_mark(doc_path)
# if res:
# print(res)
# else:
# print("No")
# input=r'C:\Users\Administrator\Desktop\fsdownload\45645668-be5e-4124-b06a-6abaa9d87d86\tmp\ztbfile_invalid.pdf'
# output=insert_mark(input)
doc_path = r'C:\Users\Administrator\Desktop\fsdownload\45645668-be5e-4124-b06a-6abaa9d87d86\tmp\invalid_added.docx'
res=delete_mark(doc_path)
if res:
print(res)
else:
print("No")

View File

@ -6,9 +6,6 @@ import time
from pathlib import Path
from openai import OpenAI
import os
from flask_app.general.llm.大模型通用函数 import get_total_tokens
from flask_app.general.llm.qianwen_turbo import qianwen_turbo
from flask_app.general.llm.大模型通用函数 import extract_error_details, shared_rate_limit
file_write_lock = threading.Lock()

View File

@ -1,8 +1,6 @@
import os
import fitz
from PyPDF2 import PdfReader, PdfWriter
from flask_app.general.截取pdf通用函数 import create_get_text_function
@ -67,6 +65,7 @@ def merge_with_fitz(paths, output_path):
for path in paths:
if not path.strip():
continue
pdf_doc = None
try:
pdf_doc = fitz.open(path)
get_text = create_get_text_function('fitz', pdf_doc)
@ -88,7 +87,9 @@ def merge_with_fitz(paths, output_path):
pdf_doc.close()
except Exception as e:
print(f"文件 '{path}' 无法使用 fitz 处理,错误: {e}")
continue
finally:
if pdf_doc is not None:
pdf_doc.close()
if merged_pdf.page_count == 0:
merged_pdf.close()

View File

@ -1,5 +1,6 @@
# flask_app/general/截取pdf通用函数.py
import concurrent.futures
import io
import os
import fitz
@ -37,8 +38,10 @@ def get_start_and_common_header(input_path, end_page):
- common_header: 公共页眉内容
- last_begin_index: 起始页的页码索引
"""
pdf_document = None
try:
# 提取公共页眉
common_header = extract_common_header(input_path) # 假设该函数已定义
last_begin_index = -1
@ -48,7 +51,6 @@ def get_start_and_common_header(input_path, end_page):
)
# 新增目录匹配模式
catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
try:
# 尝试使用 PyPDF2 读取 PDF
pdf_document = PdfReader(input_path)
@ -86,58 +88,77 @@ def get_start_and_common_header(input_path, end_page):
except Exception as e:
print(f"Error in get_start_and_common_header: {e}")
return "", -1 # 根据需求调整返回值
finally:
# 如果使用的是 PyMuPDF记得关闭文档
if pdf_document and hasattr(pdf_document, "close"):
pdf_document.close()
def check_pdf_pages(pdf_path, mode, logger):
pdf_document = None
try:
try:
# 尝试使用 PyPDF2 读取 PDF新版可直接传入文件路径
pdf_document = PdfReader(pdf_path)
num_pages = len(pdf_document.pages)
# logger.info(f"使用 PyPDF2 成功读取 PDF '{pdf_path}' 的总页数为: {num_pages}")
except Exception as e_pypdf2:
print(f"check_pdf_pages:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
logger.warning(f"check_pdf_pages: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
# 如果 PyPDF2 读取失败,尝试使用 PyMuPDF 读取 PDF
pdf_document = fitz.open(pdf_path)
num_pages = pdf_document.page_count
if num_pages <= 30:
logger.info("PDFinvalid_path页数小于或等于30页跳过切分逻辑。")
if mode == 'goods':
return True, ['', '', '', '', '', '', pdf_path, '']
else:
return True, ['', '', '', '', '', pdf_path, '']
# 若页数大于30页返回None表示继续处理
# 若页数大于30页返回 False 表示继续处理
return False, []
except Exception as e:
logger.error(f"无法读取 PDF 页数: {e}")
# 读取页数失败时同样返回 True,并给出仅填入 pdf_path 的结果列表(其余项为空字符串)
if mode == 'goods':
return True, ['', '', '', '', '', '', pdf_path, '']
else:
return True, ['', '', '', '', '', pdf_path, '']
finally:
# 如果使用的是 PyMuPDF 或其他需要关闭的对象,确保关闭资源
if pdf_document and hasattr(pdf_document, "close"):
pdf_document.close()
def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header):
pdf_document = None
using_pypdf2 = False
try:
if start_page is None or end_page is None:
print("错误: start_page 或 end_page 为 None")
return ""
# 尝试使用 PyPDF2 读取 PDF
# 尝试使用 PyPDF2 读取 PDF(新版可以直接传入路径)
try:
pdf_document = PdfReader(pdf_path)
total_pages = len(pdf_document.pages)
get_text = lambda p: pdf_document.pages[p].extract_text()
# 定义提取文本函数
get_text = create_get_text_function('pypdf2', pdf_document)
using_pypdf2 = True
# print("使用 PyPDF2 成功读取 PDF 文件。")
# print("使用 PyPDF2 成功读取 PDF 文件。")
except Exception as e_pypdf2:
print(f"save_extracted_pages:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
# 尝试使用 PyMuPDF 读取 PDF
print(f"save_extracted_pages: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
# 如果 PyPDF2 失败,尝试使用 PyMuPDF 读取 PDF
try:
pdf_document = fitz.open(pdf_path)
total_pages = pdf_document.page_count
get_text = lambda p: pdf_document.load_page(p).get_text()
get_text = create_get_text_function('fitz', pdf_document)
using_pypdf2 = False
# print("使用 PyMuPDF 成功读取 PDF 文件。")
except Exception as e_fitz:
print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
print(f"save_extracted_pages: 使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
return ""
# 检查页面范围的有效性
@ -148,6 +169,7 @@ def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_s
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
# 如果 output_suffix 为 'notice' 并且 start_page > 0则另存“目录前”的页面
if output_suffix == 'notice' and start_page > 0:
before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
toc_page = -1
@ -158,6 +180,7 @@ def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_s
except Exception as e_extract:
print(f"提取第 {page_num} 页文本时出错: {e_extract}")
continue
cleaned_text = clean_page_content(text, common_header)
if regex.search(r'\s*录', cleaned_text, regex.MULTILINE):
toc_page = page_num
@ -204,8 +227,8 @@ def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_s
print(f"添加第 {page_num} 页到 output_doc 时出错: {e_page}")
continue
try:
with open(output_pdf_path, 'wb') as f:
output_doc.write(f)
with open(output_pdf_path, 'wb') as f_out:
output_doc.write(f_out)
print(f"{output_suffix} 已截取并保存页面从 {start_page}{end_page}{output_pdf_path}")
return output_pdf_path
except Exception as e_write:
@ -213,24 +236,28 @@ def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_s
return ""
else:
output_doc = fitz.open()
for page_num in range(start_page, end_page + 1):
try:
output_doc.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
except Exception as e_page:
print(f"添加第 {page_num} 页到 output_doc 时出错: {e_page}")
continue
try:
for page_num in range(start_page, end_page + 1):
try:
output_doc.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
except Exception as e_page:
print(f"添加第 {page_num} 页到 output_doc 时出错: {e_page}")
continue
output_doc.save(output_pdf_path)
output_doc.close()
print(f"{output_suffix} 已截取并保存页面从 {start_page}{end_page}{output_pdf_path}")
return output_pdf_path
except Exception as e_write:
print(f"保存截取的页面时出错: {e_write}")
return ""
finally:
output_doc.close()
except Exception as e:
print(f"发生未知错误: {e}")
return ""
finally:
# 如果使用的是 PyMuPDF 或其他需要关闭的对象,确保关闭资源
if pdf_document and not using_pypdf2 and hasattr(pdf_document, "close"):
pdf_document.close()
def is_pdf_or_doc(filename):
# 判断文件是否为PDF或Word文档
@ -242,14 +269,14 @@ def convert_to_pdf(file_path):
return docx2pdf(file_path)
return file_path
def get_invalid_file(file_path, output_folder, common_header, begin_page):
def get_invalid_file(pdf_path, output_folder, common_header, begin_page):
"""
从指定的PDF文件中提取无效部分并保存到输出文件夹中
begin_pattern匹配开头 end_pattern匹配结尾
页数小于100不进行begin_pattern默认start_page=0
页数大于200时end_pattern从前往后匹配
Args:
file_path (str): 输入的PDF文件路径
pdf_path (str): 输入的PDF文件路径
output_folder (str): 提取后文件的输出文件夹路径
common_header (str): 公共头部文本用于清理每页的内容
@ -284,23 +311,22 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
exclusion_pattern = regex.compile(
r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
pdf_document = None
try:
# 打开PDF文件
# 尝试使用 PyPDF2 读取 PDF
try:
pdf_document = PdfReader(file_path)
pdf_lib = 'pypdf2'
# 获取总页数(对于 pypdf2 的 PDFDocument
pdf_document = PdfReader(pdf_path)
get_text = create_get_text_function('pypdf2', pdf_document)
total_pages = len(pdf_document.pages)
except Exception as e:
print(f"get_invalid_file:使用 pypdf2 读取失败,错误信息: {e},切换至 fitz。")
pdf_document = fitz.open(file_path)
pdf_lib = 'fitz'
# 获取总页数(对于 fitz 的 PDFDocument
pdf_document = fitz.open(pdf_path)
get_text = create_get_text_function('fitz', pdf_document)
total_pages = pdf_document.page_count
if total_pages <= 50:
print(f"PDF页数({total_pages}) <= 50直接返回文件路径:{file_path}")
return file_path, 0
get_text = create_get_text_function(pdf_lib, pdf_document)
print(f"PDF页数({total_pages}) <= 50直接返回文件路径: {pdf_path}")
return pdf_path, 0
# 提取并清理每页的文本内容
page_texts = []
for i in range(total_pages):
@ -308,11 +334,12 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
cleaned_text = clean_page_content(text, common_header) if text else ""
page_texts.append(cleaned_text)
# 定义查找起始页的函数
# 内部函数:查找起始页
def find_start_page(begin_page):
for pattern in begin_patterns:
for pattern in begin_patterns: # 需提前定义好
for i in range(begin_page, min(begin_page + 30, total_pages)):
text = page_texts[i]
# 若存在排除模式则跳过
if regex.search(exclusion_pattern, text):
continue
if regex.search(pattern, text):
@ -321,22 +348,25 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
# print(f"未在前30页找到模式: {pattern.pattern}")
return 0 # 默认从第一页开始
# 定义查找结束页的函数
# 内部函数:查找结束页
def find_end_page():
if total_pages > 200:
# print("总页数大于200页结束页从前往后查找跳过前30页。") #且先匹配合同
for pattern in end_patterns:
# 当页数较多时从第31页开始正向查找结束模式
for pattern in end_patterns: # 需提前定义好
for i in range(30, total_pages):
text = page_texts[i]
exclusion_pattern = regex.compile(
r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制|评审要素|评审标准)\s*$', regex.MULTILINE)
if regex.search(exclusion_pattern,text):
# 定义局部的排除模式
exclusion_pat = regex.compile(
r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制|评审要素|评审标准)\s*$',
regex.MULTILINE
)
if regex.search(exclusion_pat, text):
continue
if regex.search(pattern, text):
# print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}")
return i
else:
# print("结束页从后往前查找确保只剩前30页时停止。")
# 当页数较少时从后向前查找至少保留前30页
for pattern in end_patterns:
for i in range(total_pages - 1, 29, -1):
text = page_texts[i]
@ -344,77 +374,76 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
# print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}")
return i
# 如果没有匹配到设置end_page逻辑
# 若未匹配到结束模式,设置默认结束页
if total_pages > 200:
print(f"未找到结束模式总页数大于200平滑调整结束页为 {end_page}" + file_path)
print(f"未找到结束模式总页数大于200平滑调整结束页为 {max(100, int(total_pages * 1 / 2))}{pdf_path}")
return max(100, int(total_pages * 1 / 2))
elif 100 < total_pages <= 200:
print("未找到结束模式总页数在100到200之间设置结束页为总页数的三分之二。" + file_path)
print("未找到结束模式总页数在100到200之间设置结束页为总页数的三分之二。 " + pdf_path)
return max(100, int(total_pages * 2 / 3))
else:
print("未找到结束模式,设置结束页为最后一页。" + file_path)
print("未找到结束模式,设置结束页为最后一页。 " + pdf_path)
return total_pages - 1
# 根据总页数决定是否查找起始页
# 根据总页数决定是否并发查找起始页与结束页
if total_pages < 100:
# print("总页数少于100页默认起始页为第一页。")
start_page = 0
start_page_found = 0
with concurrent.futures.ThreadPoolExecutor() as executor:
future_end = executor.submit(find_end_page)
end_page = future_end.result()
end_page_found = future_end.result()
else:
with concurrent.futures.ThreadPoolExecutor() as executor:
future_start = executor.submit(find_start_page, begin_page)
future_end = executor.submit(find_end_page)
start_page = future_start.result()
end_page = future_end.result()
# print(f"起始页: {start_page + 1}, 结束页: {end_page + 1}")
# 验证页码范围
if start_page > end_page:
print(f"无效的页码范围: 起始页 ({start_page + 1}) > 结束页 ({end_page + 1})")
start_page_found = future_start.result()
end_page_found = future_end.result()
# 检查页码范围是否有效
if start_page_found > end_page_found:
print(f"无效的页码范围: 起始页 ({start_page_found + 1}) > 结束页 ({end_page_found + 1})")
return "", 0
# 调用已实现的保存函数
# 调用已实现的保存函数,保存提取的页面
output_path = save_extracted_pages(
pdf_path=file_path,
pdf_path=pdf_path,
output_folder=output_folder,
start_page=start_page,
end_page=end_page,
start_page=start_page_found,
end_page=end_page_found,
output_suffix="invalid",
common_header=common_header
)
# print(f"提取的页面 {start_page} 到 {end_page} 已保存至 {output_path}")
return output_path, end_page
return output_path, end_page_found
except Exception as e:
print(f"处理文件 {file_path} 时发生错误: {e}")
print(f"处理文件 {pdf_path} 时发生错误: {e}")
return "", 0
finally:
# 确保释放资源(对 fitz 的 pdf_document 需要关闭)
if pdf_document and hasattr(pdf_document, "close"):
pdf_document.close()
def extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
output_suffix="normal", invalid_endpage=-1):
start_page = None
end_page = None
flag = True
# 先尝试使用 PyPDF2
try:
# 尝试使用 PyPDF2 读取 PDF
pdf_document = PdfReader(pdf_path)
total_pages = len(pdf_document.pages)
get_text = create_get_text_function('pypdf2', pdf_document)
# print("使用 PyPDF2 成功读取 PDF 文件。")
except Exception as e_pypdf2:
print(f"extract_pages_generic:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
# 如果 PyPDF2 失败,就再试 PyMuPDF
try:
# 如果 PyPDF2 失败,尝试使用 PyMuPDF 读取 PDF
pdf_document = fitz.open(pdf_path)
total_pages = pdf_document.page_count
get_text = create_get_text_function('fitz', pdf_document)
# print("使用 PyMuPDF 成功读取 PDF 文件。")
except Exception as e_fitz:
print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
return None, None # 或者根据需求抛出异常
return None, None
end_limit = total_pages if invalid_endpage == -1 else min(invalid_endpage, total_pages)
for i in range(begin_page, end_limit):
text = get_text(i)
@ -447,7 +476,9 @@ def extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, comm
if regex.search(end_pattern, cleaned_text) and not regex.search(begin_pattern, cleaned_text):
end_page = i
break
# 如果使用的是 PyMuPDF需要手动关闭文档
if pdf_document and isinstance(pdf_document, fitz.Document):
pdf_document.close()
return start_page, end_page
@ -534,7 +565,7 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
"""
exclusion_pattern = regex.compile(
r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
def run_extraction(pdf_document,begin_pattern, extraction_stage='first'):
def run_extraction(begin_pattern, extraction_stage='first'):
"""
使用提供的 begin_pattern 运行提取过程
根据 extraction_stage 判断是第一次提取还是第三次提取
@ -554,14 +585,11 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
end_page = None
combined_mid_pattern = None # 中间页的组合模式
catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
total_pages = len(pdf_document.pages) if isinstance(pdf_document, PdfReader) else pdf_document.page_count
end_limit = total_pages if invalid_endpage == -1 else min(invalid_endpage, total_pages)
# ===== 遍历每一页,执行提取逻辑 =====
for i in range(end_limit):
try:
text = pdf_document.pages[i].extract_text() if isinstance(pdf_document,
PdfReader) else pdf_document.load_page(i).get_text()
text = text or ""
text = get_text(i)
cleaned_text = clean_page_content(text, common_header)
except Exception as e:
print(f"提取第 {i} 页文本时出错: {e}")
@ -621,60 +649,77 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
path2 = path1
return path1, path2
pdf_document = None
try:
pdf_document = PdfReader(pdf_path)
total_pages = len(pdf_document.pages)
except Exception as e_pypdf2:
print(f"extract_pages_tobidders_notice:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
try:
pdf_document = fitz.open(pdf_path)
total_pages = pdf_document.page_count
# print("使用 PyMuPDF 成功读取 PDF 文件。")
except Exception as e_fitz:
print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
return "", "" # 或者根据需求抛出异常
pdf_document = PdfReader(pdf_path)
get_text = create_get_text_function('pypdf2', pdf_document)
total_pages = len(pdf_document.pages)
except Exception as e_pypdf2:
print(f"extract_pages_tobidders_notice:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
try:
pdf_document = fitz.open(pdf_path)
get_text = create_get_text_function('fitz', pdf_document)
total_pages = pdf_document.page_count
# print("使用 PyMuPDF 成功读取 PDF 文件。")
except Exception as e_fitz:
print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
return "", "" # 或者根据需求抛出异常
# 第一次提取尝试,使用初始的 begin_pattern
begin_pattern = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+|'
r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表\s*$|'
r'(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+\s*$',
regex.MULTILINE
)
start_page, mid_page, end_page = run_extraction(pdf_document,begin_pattern, extraction_stage='first')
# 如果第一次提取成功,保存并返回路径
if start_page is not None and mid_page is not None and end_page is not None:
return perform_extraction_and_save(start_page, mid_page, end_page)
# 第一次提取尝试,使用初始的 begin_pattern
begin_pattern = regex.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+|'
r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表\s*$|'
r'(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+\s*$',
regex.MULTILINE
)
start_page, mid_page, end_page = run_extraction(begin_pattern, extraction_stage='first')
# 如果第一次提取成功,保存并返回路径
if start_page is not None and mid_page is not None and end_page is not None:
return perform_extraction_and_save(start_page, mid_page, end_page)
print(f"第一次提取tobidders_notice失败尝试第二次提取: {pdf_path}")
start_page1, end_page1, start_page2, end_page2 = extract_pages_twice_tobidders_notice(pdf_path, common_header, begin_page)
print(f"第一次提取tobidders_notice失败尝试第二次提取: {pdf_path}")
start_page1, end_page1, start_page2, end_page2 = extract_pages_twice_tobidders_notice(pdf_path, common_header, begin_page)
if start_page1 is not None and end_page1 is not None and start_page2 is not None and end_page2 is not None:
if end_page1==start_page2:
mid_page=end_page1
return perform_extraction_and_save(start_page1, mid_page, end_page2)
if start_page1 is not None and end_page1 is not None and start_page2 is not None and end_page2 is not None:
if end_page1==start_page2:
mid_page=end_page1
return perform_extraction_and_save(start_page1, mid_page, end_page2)
else:
path1=save_extracted_pages(pdf_path,output_folder,start_page1,end_page1,"tobidders_notice_part1",common_header)
path2=save_extracted_pages(pdf_path,output_folder,start_page2,end_page2,"tobidders_notice_part2",common_header)
return path1,path2
print("第二次提取tobidders_notice失败尝试第三次提取!")
new_begin_pattern = regex.compile(
r'.*(?:投标人|磋商|谈判|供应商|应答人|比选申请人).*须知\s*$|'
r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表\s*$',
regex.MULTILINE
)
start_page, mid_page, end_page = run_extraction(new_begin_pattern, extraction_stage='third')
if start_page is not None and mid_page is not None and end_page is not None:
return perform_extraction_and_save(start_page, mid_page, end_page)
else:
path1=save_extracted_pages(pdf_path,output_folder,start_page1,end_page1,"tobidders_notice_part1",common_header)
path2=save_extracted_pages(pdf_path,output_folder,start_page2,end_page2,"tobidders_notice_part2",common_header)
return path1,path2
print("第二次提取tobidders_notice失败尝试第三次提取!")
new_begin_pattern = regex.compile(
r'.*(?:投标人|磋商|谈判|供应商|应答人|比选申请人).*须知\s*$|'
r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表\s*$',
regex.MULTILINE
)
start_page, mid_page, end_page = run_extraction(pdf_document, new_begin_pattern, extraction_stage='third')
if start_page is not None and mid_page is not None and end_page is not None:
return perform_extraction_and_save(start_page, mid_page, end_page)
else:
# 所有提取尝试均失败
print(f"三次提取tobidders_notice均失败!! {pdf_path}")
# 所有提取尝试均失败
print(f"三次提取tobidders_notice均失败!! {pdf_path}")
return "", ""
except Exception as e:
print(f"处理文件 {pdf_path} 时发生错误: {e}")
return "", ""
finally:
# 无论如何都关闭 pdf_document(对于 fitz 对象)
if pdf_document and hasattr(pdf_document, "close"):
pdf_document.close()
def extract_pages_twice_tobidders_notice(pdf_path, common_header, begin_page):
pdf_document=PdfReader(pdf_path)
# 直接按路径创建 PdfReader(此处并未使用 with open);读取失败时打印错误并返回四个 None
try:
pdf_document = PdfReader(pdf_path)
except Exception as e:
print(f"读取 PDF 文件失败: {e}")
return None, None, None, None
# 投标须知前附表占一章、投标须知正文占一章
output_suffix = "tobidders_notice"
begin_pattern = regex.compile(
@ -696,16 +741,13 @@ def extract_pages_twice_tobidders_notice(pdf_path, common_header, begin_page):
if start_page1 is None or end_page1 is None:
return None, None, None,None
# 提取第二部分
# 检查end_page1页面的内容
text = pdf_document.pages[end_page1].extract_text() or ""
# 提取第二部分 先检查end_page1页面的内容
text = get_text_pypdf2(pdf_document,end_page1)
cleaned_text = clean_page_content(text, common_header)
match = end_pattern.search(cleaned_text)
if match:
# 获取匹配到的中文部分
chapter_title = match.group(1)
chapter_title = match.group(1) if match.groups() else ""
# 检查是否包含排除关键词
if any(word in chapter_title for word in exclusion_words):
# 如果包含排除关键词,直接返回相同的路径
@ -722,7 +764,6 @@ def extract_pages_twice_tobidders_notice(pdf_path, common_header, begin_page):
return start_page1, end_page1, start_page2,end_page2
#return start_page1, end_page1, end_page1
def get_notice(pdf_path, output_folder, begin_page, common_header,invalid_endpage):
begin_pattern = regex.compile(
r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)]?\s*$',

View File

@ -563,12 +563,8 @@ def handle_query(file_path, user_query, output_file, result_key, keywords):
else:
user_query = generate_full_user_query(output_file, user_query)
model_ans = qianwen_plus(user_query) # 豆包模型返回结果
# file_id = upload_file(output_file)
# model_ans = qianwen_long(file_id, user_query)
num_list = process_string_list(model_ans) # 处理模型返回的序号
num_list = process_string_list(model_ans) # 处理模型返回的序号列表
print(result_key + "选中的序号:" + str(num_list))
# print(all_texts1_list)
# print(all_text1_items)
for index in num_list:
if 1 <= index <= len(all_texts1_list):
original_global_idx = all_text1_items[index - 1][0]

View File

@ -3,6 +3,8 @@ import threading
import fitz
from PyPDF2 import PdfReader
from docx import Document
import queue
from queue import Queue, Empty
def extract_common_header(pdf_path):
"""
提取 PDF 文件的公共页眉
@ -181,36 +183,60 @@ def clean_page_content(text, common_header):
text = re.sub(r'^\s*[—-]\s*(第\s*)?\d{1,3}\s*(页)?\s*[—-]\s*', '', text) # 删除形如 '—2—', '-2-', 或 '-第2页-' 的页码
return text
def read_page_text_with_timeout(page, timeout=3):
"""
尝试在子线程里执行 page.extract_text()
如果超过 `timeout` 秒没有完成就返回 None表示超时
如果正常完成就返回提取到的文本
线程安全的带超时文本提取改进版
特性
1. 使用Queue保证线程间通信安全
2. 添加双重超时控制
3. 完善的资源清理机制
"""
done = threading.Event()
result = []
result_queue = Queue(maxsize=1)
done_flag = threading.Event()
def wrapper():
def extract_worker():
try:
txt = page.extract_text() # 可能会卡住的操作
result.append(txt)
# 提取文本,结果为 None 时统一成空字符串;这里本身不设上限,超时由外层的 done_flag.wait(timeout) 控制
txt = page.extract_text() or ""
result_queue.put(txt)
except Exception as e:
print("提取文本时出错:", e)
result.append("")
print(f"文本提取异常: {str(e)}")
result_queue.put("")
finally:
done.set()
done_flag.set() # 确保事件触发
# 启动后台线程
thread = threading.Thread(target=wrapper, daemon=True)
thread.start()
# 启动工作线程
worker = threading.Thread(
target=extract_worker,
daemon=True,
name=f"PDFTextExtract-{id(page)}"
)
worker.start()
# 等待提取结果,超过 timeout 秒则视为超时
finished_in_time = done.wait(timeout)
if not finished_in_time:
print(f"单页文本提取超时(超过 {timeout} 秒)!")
# 双重等待机制
try:
# 第一阶段:等待事件标志
if done_flag.wait(timeout):
# 第二阶段:立即获取结果
return result_queue.get_nowait()
# 超时处理流程
print(f"文本提取超时({timeout}s终止操作")
return None
return result[0] if result else ""
except Empty:
# 罕见情况:事件触发但队列无数据
print("队列数据异常,返回空文本")
return ""
finally:
# 资源清理
if worker.is_alive():
try:
worker._stop() # 注意:_stop() 是 threading 的私有方法,并非受支持的线程终止方式;若抛出异常会被下方 except 吞掉
except Exception:
pass
def is_scanned_pdf(file_path, max_pages=15, page_timeout=3, overall_timeout=30):
"""
@ -220,58 +246,43 @@ def is_scanned_pdf(file_path, max_pages=15, page_timeout=3, overall_timeout=30):
3. 如果前 max_pages 页都检测完成均无可见文本则返回 True认为是扫描件
4. 如果需要整体超时overall_timeout则在最外层加一个封装线程进行控制
"""
def core_check(result_container, done_event):
"""
真正的核心逻辑执行完后把结果塞进 result_container[0]
然后调用 done_event.set() 告知整个检查流程结束
"""
result_queue = queue.Queue()
done_event = threading.Event()
def core_check():
try:
with open(file_path, 'rb') as f:
reader = PdfReader(f)
for i, page in enumerate(reader.pages):
if i >= max_pages:
if i >= max_pages or done_event.is_set():
break
# 尝试在 page_timeout 内获取文本
text = read_page_text_with_timeout(page, page_timeout)
if text is None:
print(f"{i+1} 页文本提取超时,直接判定为扫描件。")
result_container[0] = True
done_event.set()
print(f"Page {i + 1} timeout")
result_queue.put(True)
return
if text.strip():
print(f"{i+1} 页检测到文本,判定为非扫描件。")
result_container[0] = False
done_event.set()
print(f"Text found on page {i + 1}")
result_queue.put(False)
return
result_queue.put(True)
except Exception as e:
print("处理 PDF 文件时发生异常:", e)
result_container[0] = True
print(f"Processing error: {str(e)}")
result_queue.put(True)
finally:
done_event.set()
return
print(f"{max_pages} 页均未检测到文本,判定为扫描件。")
result_container[0] = True
done_event.set()
result_container = [None] # 用于在子线程中传递结果
done = threading.Event()
thread = threading.Thread(target=core_check, args=(result_container, done), daemon=True)
thread = threading.Thread(target=core_check, daemon=True)
thread.start()
try:
result = result_queue.get(timeout=overall_timeout)
except queue.Empty:
print(f"Overall timeout after {overall_timeout}s")
return True # 或定义特殊超时状态码
# 等待整体流程结束,最多等待 overall_timeout 秒
finished_in_time = done.wait(overall_timeout)
if not finished_in_time:
print(f"整体检查超时(超过 {overall_timeout} 秒),返回默认结果。")
return True # 或者根据需要返回其它默认值
# 打印最终结果调试信息
if result_container[0]:
print("最终结果:认为是扫描件(未检测到有效文本或发生单页超时)")
else:
print("最终结果:认为非扫描件(检测到有效文本)")
return result_container[0]
return result
def is_pure_image(docx_path, percentage=0.3):
"""

View File

@ -12,8 +12,10 @@ def judge_zbfile_exec(file_path):
start_time = time.time()
# 检查文件是否为PDF格式
if file_path.lower().endswith('.pdf'):
reader = PdfReader(file_path)
num_pages = len(reader.pages)
# 使用 with 语句确保文件关闭
with open(file_path, 'rb') as f:
reader = PdfReader(f)
num_pages = len(reader.pages)
if num_pages <= 5:
return False
# 模拟使用大模型进行判断
@ -32,11 +34,7 @@ def judge_zbfile_exec(file_path):
print(f"judge_zbfile_exec实际耗时{end_time - start_time:.2f}")
print(f"判断是否属于招标文件:{model_res}")
# 根据模型返回结果判断
if '' in model_res:
return False
else:
return True
return '' not in model_res
except Exception as e:
print(f"处理文件时出错: {e}")

View File

@ -1,4 +1,5 @@
# flask_app/start_up.py
import os
from flask import Flask, g
from flask_app.ConnectionLimiter import ConnectionLimiter
@ -11,16 +12,10 @@ from flask_app.general.llm.清除file_id import delete_file_by_ids,read_file_ids
from flask_app.routes.judge_zbfile import judge_zbfile_bp
class FlaskAppWithLimiter(Flask):
    """Flask application that also carries a registry of connection limiters."""

    def __init__(self, *args, **kwargs):
        """Initialise the Flask app, then create the empty limiter registry."""
        super(FlaskAppWithLimiter, self).__init__(*args, **kwargs)
        # One limiter per blueprint is stored in this dictionary; it starts empty.
        self.connection_limiters = dict()
def create_app():
app = FlaskAppWithLimiter(__name__)
# 创建全局日志记录器
app = Flask(__name__)
app.global_logger = create_logger_main('global_log') # 全局日志记录器
# 注册蓝图
app.register_blueprint(get_deviation_bp)
@ -28,11 +23,14 @@ def create_app():
app.register_blueprint(upload_bp)
app.register_blueprint(test_zbparse_bp)
app.register_blueprint(judge_zbfile_bp)
# app.connection_limiters['upload'] = ConnectionLimiter(max_connections=100)
# app.connection_limiters['get_deviation'] = ConnectionLimiter(max_connections=100)
# app.connection_limiters['default'] = ConnectionLimiter(max_connections=100)
# app.connection_limiters['judge_zbfile'] = ConnectionLimiter(max_connections=100)
@app.teardown_request
def check_fd(exception=None):
    """Warn when a request ends with file descriptors it did not start with.

    Compares the entries of /proc/self/fd at teardown with the snapshot kept
    in ``g.start_fd`` and logs a warning with the number of new entries.

    The check is best effort and must never raise, because it runs as a
    teardown handler:
    - if ``g.start_fd`` was never recorded for this request, there is nothing
      to compare against and the check is skipped;
    - if /proc/self/fd cannot be listed (for example on a platform without
      procfs), the check is skipped;
    - if ``g.logger`` is missing, the application-wide logger is used.

    Args:
        exception: the unhandled exception of the request, if any (unused).
    """
    start_fd = getattr(g, 'start_fd', None)
    if start_fd is None:
        # No baseline was captured for this request.
        return

    try:
        end_fd = os.listdir('/proc/self/fd')
    except OSError:
        # procfs is unavailable or unreadable; skip instead of failing teardown.
        return

    leaked = set(end_fd) - set(start_fd)
    if leaked:
        logger = getattr(g, 'logger', None) or app.global_logger
        logger.warning(f"潜在FD泄漏: {len(leaked)}个未关闭")
# @app.teardown_request
# def teardown_request(exception):
# # 接口请求之后都会执行该代码,做一些清理工作

View File

@ -29,6 +29,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
处理 PDF 文件作为 truncate_pdf_main 的后备方法
此函数将在所有模式对都失败后调用
"""
pdf_document=None
try:
try:
pdf_document = PdfReader(pdf_path)
@ -109,6 +110,9 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
except Exception as e:
print(f"Error in extract_pages_twice: {e}")
return ""
finally:
if pdf_document and hasattr(pdf_document, 'close'):
pdf_document.close()
def truncate_pdf_main_engineering(input_path, output_folder, selection, logger, output_suffix="default",invalid_endpage=-1):

View File

@ -55,7 +55,6 @@ def get_patterns_for_evaluation_method():
return begin_pattern, end_pattern
def extract_pages_qualification(pdf_path, begin_page, common_header):
pdf_document = PdfReader(pdf_path)
# 开始匹配模式,仅匹配“附录”、“附件”或“附表”
begin_pattern = regex.compile(
r'^(?:附录(?:一|1)?[:]?|附件(?:一|1)?[:]?|附表(?:一|1)?[:]?)(?!的)',
@ -84,65 +83,69 @@ def extract_pages_qualification(pdf_path, begin_page, common_header):
end_page = None
include_keywords = ["资格审查", "资质审查", "符合性审查","资格性审查", "符合性检查","资格性检查", "资格检查"]
exclude_keywords = ["声明函", "承诺函"]
pdf_document = None
try:
# 尝试使用 PyPDF2 读取 PDF
pdf_document = PdfReader(pdf_path)
total_pages = len(pdf_document.pages)
get_text = create_get_text_function('pypdf2', pdf_document)
# print("使用 PyPDF2 成功读取 PDF 文件。")
except Exception as e_pypdf2:
print(f"extract_pages_qualification:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
try:
# 如果 PyPDF2 失败,尝试使用 PyMuPDF 读取 PDF
pdf_document = fitz.open(pdf_path)
total_pages = pdf_document.page_count
get_text = create_get_text_function('fitz', pdf_document)
# print("使用 PyMuPDF 成功读取 PDF 文件。")
except Exception as e_fitz:
print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
return None, None # 或者根据需求抛出异常
# 从指定的开始页开始遍历
for i in range(begin_page, total_pages):
text = get_text(i)
if text:
cleaned_text = clean_page_content(text, common_header)
# 优先检查是否匹配优先模式
priority_match = priority_pattern.search(cleaned_text)
if priority_match and start_page is None:
start_page = i
print(f"匹配到priority_pattern设置起始页为: {start_page}")
continue
else:
# 如果未匹配优先模式,则按照一般模式进行判断
if (
any(keyword in cleaned_text for keyword in include_keywords) and
all(keyword not in cleaned_text for keyword in exclude_keywords) and
start_page is None
):
if begin_pattern.search(cleaned_text):
start_page = i
print(f"匹配到附录等模式begin_pattern设置起始页为: {start_page}")
continue
# 尝试使用 PyPDF2 读取 PDF
pdf_document = PdfReader(pdf_path)
total_pages = len(pdf_document.pages)
get_text = create_get_text_function('pypdf2', pdf_document)
# print("使用 PyPDF2 成功读取 PDF 文件。")
except Exception as e_pypdf2:
print(f"extract_pages_qualification:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
try:
# 如果 PyPDF2 失败,尝试使用 PyMuPDF 读取 PDF
pdf_document = fitz.open(pdf_path)
total_pages = pdf_document.page_count
get_text = create_get_text_function('fitz', pdf_document)
# print("使用 PyMuPDF 成功读取 PDF 文件。")
except Exception as e_fitz:
print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
return None, None # 或者根据需求抛出异常
# 从指定的开始页开始遍历
for i in range(begin_page, total_pages):
text = get_text(i)
if text:
cleaned_text = clean_page_content(text, common_header)
# 优先检查是否匹配优先模式
priority_match = priority_pattern.search(cleaned_text)
if priority_match and start_page is None:
start_page = i
print(f"匹配到priority_pattern设置起始页为: {start_page}")
continue
else:
# 如果未匹配优先模式,则按照一般模式进行判断
if (
any(keyword in cleaned_text for keyword in include_keywords) and
all(keyword not in cleaned_text for keyword in exclude_keywords) and
start_page is None
):
if begin_pattern.search(cleaned_text):
start_page = i
print(f"匹配到附录等模式begin_pattern设置起始页为: {start_page}")
continue
# 确定结束页 - 附录、附件、附表等
if start_page is not None and end_pattern_attachment.search(cleaned_text):
# 额外检查当前页面是否不包含任何 include_keywords
if not any(keyword in cleaned_text for keyword in include_keywords):
# 确定结束页 - 附录、附件、附表等
if start_page is not None and end_pattern_attachment.search(cleaned_text):
# 额外检查当前页面是否不包含任何 include_keywords
if not any(keyword in cleaned_text for keyword in include_keywords):
if i > start_page:
end_page = i
print(f"qualification找到结束页 (附件类): {end_page}")
break # 找到结束页后退出循环
else:
print(f"当前页面匹配附件结束模式但包含 include_keywords继续查找。页面: {i}")
# 确定结束页 - 章节标题
elif start_page is not None and end_pattern_chapter.search(cleaned_text):
if i > start_page:
end_page = i
print(f"qualification找到结束页 (附件类): {end_page}")
print(f"qualification找到结束页 (章节标题): {end_page}")
break # 找到结束页后退出循环
else:
print(f"当前页面匹配附件结束模式但包含 include_keywords继续查找。页面: {i}")
# 确定结束页 - 章节标题
elif start_page is not None and end_pattern_chapter.search(cleaned_text):
if i > start_page:
end_page = i
print(f"qualification找到结束页 (章节标题): {end_page}")
break # 找到结束页后退出循环
return start_page, end_page
return start_page, end_page
finally:
if pdf_document and hasattr(pdf_document, 'close'):
pdf_document.close()
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page, logger):
@ -322,8 +325,8 @@ if __name__ == "__main__":
pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\2c3e2291-6804-4ef0-b4a8-6f457edd5709\ztbfile.pdf"
# input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf"
# pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
output_folder = r"C:\Users\Administrator\Desktop\货物标\output5"
output_folder = r"C:\Users\Administrator\Desktop\货物标\output4"
# output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
selection = 5 # 例如1 - 公告 notice , 2 - 评标办法 evaluation_method, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 procurement 6-invalid
selection = 4 # 例如1 - 公告 notice , 2 - 评标办法 evaluation_method, 3 - 资格审查后缀有qualification1或qualification2与评标办法一致 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 procurement 6-invalid
generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
print(generated_files)