2.14 尝试解决内存泄漏问题

2025-02-14 15:46:07 +08:00 · 2025-02-14 15:46:07 +08:00 · 979d7fac21
commit 979d7fac21
parent cef8ff5415
11 changed files with 387 additions and 328 deletions
--- a/flask_app/general/format_change.py
+++ b/flask_app/general/format_change.py
@@ -9,12 +9,10 @@ from flask_app.general.读取文件.clean_pdf import is_scanned_pdf, is_pure_ima
 def download_file(url, local_filename,enable=False):
    """
    下载文件并保存到本地，基于Content-Type设置文件扩展名。
    参数:
    - url (str): 文件的URL地址。
    - local_filename (str): 本地保存的文件名（不含扩展名）。
    - enable: 是否需要判断为扫描型/纯图片文件
    返回:
    - tuple: (完整文件名, 文件类型代码)
             文件类型代码:
@@ -31,10 +29,10 @@ def download_file(url, local_filename,enable=False):
            # 获取Content-Type并猜测文件扩展名
            content_type = response.headers.get('Content-Type', '')
            extension = mimetypes.guess_extension(content_type, strict=False) or '.docx'
-
+            extension_lower = extension.lower()             #显式地将 extension 转换为小写（存入 extension_lower），更加健壮
-            # 分离文件名和现有扩展名
+            # 根据传入的 local_filename 构造完整的文件名
-            base, ext = os.path.splitext(local_filename)
+            base, current_ext = os.path.splitext(local_filename)
-            if ext.lower() != extension:
+            if current_ext.lower() != extension_lower:
                full_filename = base + extension
            else:
                full_filename = local_filename
@@ -94,6 +92,7 @@ def local_file_2_url(file_path, url):
        print("format_change 文件上传成功")
        receive_file_response = response.content.decode('utf-8')
        receive_file_json = json.loads(receive_file_response)
        print(receive_file_json)
        receive_file_url = receive_file_json["data"]
    else:
@@ -207,17 +206,17 @@ def get_pdf_page_count(file_path):
 if __name__ == '__main__':
    # 替换为你的文件路径和API URL
    # local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\1fbbb6ff-7ddc-40bb-8857-b7de37aece3f\\兴欣工程.pdf"
-    # local_path_in = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.docx"
+    local_path_in = r"C:\Users\Administrator\Downloads\基础公司枞阳经开区新能源汽车零部件产业园基础设施建设项目二期桩基工程劳务分包.doc"
    # local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf"
    # local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf"
    # local_path_in=r"C:\Users\Administrator\Desktop\fsdownload\457ee03d-c61c-4672-b959-2bbb35a1de29\ztbfile_invalid.pdf"
-    local_path_in = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\ztbfile(1).pdf"
+    # local_path_in = r"C:\Users\Administrator\Desktop\新建文件夹 (3)\ztbfile(1).pdf"
-    intermediate_docx = pdf2docx(local_path_in)
+    # intermediate_docx = pdf2docx(local_path_in)
-    if intermediate_docx:
+    # if intermediate_docx:
-        normal_pdf = docx2pdf(intermediate_docx, force=True)
+    #     normal_pdf = docx2pdf(intermediate_docx, force=True)
    # # downloaded_file=pdf2docx(local_path_in)
-    # downloaded_file=docx2pdf(local_path_in)
+    downloaded_file=docx2pdf(local_path_in)
-    # print(downloaded_file)
+    print(downloaded_file)
    # test_url = "https://bid-assistance.oss-cn-wuhan-lr.aliyuncs.com/test/%E6%B5%8B%E8%AF%95%E4%BF%A1%E5%8F%B7%E6%B5%8B%E8%AF%95%E4%BF%A1%E5%8F%B7.pdf?Expires=1736852995&OSSAccessKeyId=TMP.3Kg1oKKcsSWb7DXNe4F56bfGfKY5nNWUi274p39HyY7GR3mghMCaFWy69Fi83SBab6PmSkErh4JUD4yAxAGzVVx2hxxoxm&Signature=rmxS5lett4MzWdksDI57EujCklw%3"
    # local_file_name = r'D:\flask_project\flask_app\static\output\output1\1d763771-f25a-4b65-839e-3b2ca56577b1\tmp\ztbfile.pdf'
--- a/flask_app/general/insert_del_pagemark.py
+++ b/flask_app/general/insert_del_pagemark.py
@@ -23,36 +23,32 @@ def insert_mark(input_pdf_path):
            # 遍历每一页
            for page_num in range(total_pages):
                # 添加原始页面
                page = pdf_reader.pages[page_num]
                pdf_writer.add_page(page)
-                # 创建一个内存中的PDF，用于存放带有文本的空白页
+                # 使用 BytesIO 的上下文管理器创建内存中的 PDF 页面用于标记
-                packet = BytesIO()
+                with BytesIO() as packet:
                    # 获取当前页面的宽度和高度
                    page_width = float(page.mediabox.width)
                    page_height = float(page.mediabox.height)
-                # 使用reportlab创建一个新的PDF页面
+                    # 使用 reportlab 创建一个新的 PDF 页面，页面大小与当前页面相同
                    c = canvas.Canvas(packet, pagesize=(page_width, page_height))
-                # 计算文本的位置（单位：点，1厘米 ≈ 28.35点）
+                    # 计算文本位置：距左边距 2.3cm，距顶部 0.5cm
                    x_position = 2.3 * cm
-                y_position = page_height - 0.5 * cm  # 从顶部开始计算，因此用页面高度减去上边距
+                    y_position = page_height - 0.5 * cm
-
+                    c.setFont("Helvetica", 12)
                # 绘制文本，使用 (page_num + 1) 作为索引
                c.setFont("Helvetica", 12)  # 设置字体和大小
                    c.drawString(x_position, y_position, f"[$$index_mark_{page_num + 1}$$]")
-                # 完成绘制
+                    # 完成绘制，将内容写入 BytesIO 流
                    c.save()
-                # 将内存中的PDF读入PyPDF2
+                    # 重置流位置，读取刚刚生成的 PDF 页面
                    packet.seek(0)
                    new_pdf = PdfReader(packet)
-                # blank_page = new_pdf.pages[0]
+                    # 使用 deepcopy 确保页面对象独立，避免后续修改影响已添加页面
                    blank_page = copy.deepcopy(new_pdf.pages[0])
                packet.truncate(0)
                packet.seek(0)
                # 将带有文本的空白页添加到写入器
                    pdf_writer.add_page(blank_page)
            # 将所有页面写入输出的PDF文件
@@ -65,10 +61,14 @@ def insert_mark(input_pdf_path):
        print(f"发生错误: {e}")
        return input_pdf_path
 def delete_mark(docx_path):
    """
    删除文档中包含标记文本 "[$$index_mark_" 的段落，
    如果该段落后紧跟一个包含 "w:sectPr" 的空白分节符段落，也一并删除。
    修改后的文档保存为 invalid_del.docx，并返回新文件路径。
    """
    try:
-        docx = Document(docx_path)
+        doc = Document(docx_path)
    except KeyError as e:
        print(f"Error opening document: {e}")
        return ""
@@ -76,30 +76,41 @@ def delete_mark(docx_path):
        print(f"Invalid package: {e}")
        return ""
-    # 继续处理文档
+    # 收集所有需要删除的段落，避免在遍历时直接修改列表
-    find_flag = False
+    paragraphs_to_remove = []
-    for para in docx.paragraphs:
+    paragraphs = doc.paragraphs
-        # 匹配标记: [$$index_mark_X$$]
+    i = 0
    while i < len(paragraphs):
        para = paragraphs[i]
        if "[$$index_mark_" in para.text:
-            para._element.getparent().remove(para._element)  # 删标记
+            paragraphs_to_remove.append(para)
-            find_flag = True
+            # 如果当前段落后面有段落且该段落的 XML 中包含 "w:sectPr"，也删除该段落
-        if find_flag and "w:sectPr" in para._element.xml:  # 删空白分节符
+            if i + 1 < len(paragraphs):
-            para._element.getparent().remove(para._element)
+                next_para = paragraphs[i + 1]
-            find_flag = False
+                if "w:sectPr" in next_para._element.xml:
-    dir_path = os.path.dirname(docx_path)
+                    paragraphs_to_remove.append(next_para)
-    new_file_path = os.path.join(dir_path, 'invalid_del.docx')
+                    i += 1  # 跳过下一个段落，避免重复处理
        i += 1
-    # 保存修改后的文档
+    # 从文档 XML 中移除收集到的段落
-    docx.save(new_file_path)
+    for para in paragraphs_to_remove:
        parent = para._element.getparent()
        if parent is not None:
            parent.remove(para._element)
    new_file_path = os.path.join(os.path.dirname(docx_path), 'invalid_del.docx')
    doc.save(new_file_path)
    return new_file_path
 if __name__ == '__main__':
    # input=r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\ztbfile.pdf'
-    input=r'C:\Users\Administrator\Desktop\fsdownload\45645668-be5e-4124-b06a-6abaa9d87d86\ztbfile_invalid.pdf'
+    # input=r'C:\Users\Administrator\Desktop\fsdownload\45645668-be5e-4124-b06a-6abaa9d87d86\tmp\ztbfile_invalid.pdf'
-    output=insert_mark(input)
+    # output=insert_mark(input)
-    # doc_path = r'C:\Users\Administrator\Desktop\fsdownload\0bb9cf31-280c-4d96-bc21-0871ee7fd6df\tmp\invalid_added.docx'
+    doc_path = r'C:\Users\Administrator\Desktop\fsdownload\45645668-be5e-4124-b06a-6abaa9d87d86\tmp\invalid_added.docx'
-    # res=delete_mark(doc_path)
+    res=delete_mark(doc_path)
-    # if res:
+    if res:
-    #     print(res)
+        print(res)
-    # else:
+    else:
-    #     print("No")
+        print("No")
--- a/flask_app/general/llm/通义千问long.py
+++ b/flask_app/general/llm/通义千问long.py
@@ -6,9 +6,6 @@ import time
 from pathlib import Path
 from openai import OpenAI
 import os
 from flask_app.general.llm.大模型通用函数 import get_total_tokens
 from flask_app.general.llm.qianwen_turbo import qianwen_turbo
 from flask_app.general.llm.大模型通用函数 import extract_error_details, shared_rate_limit
 file_write_lock = threading.Lock()
--- a/flask_app/general/merge_pdfs.py
+++ b/flask_app/general/merge_pdfs.py
@@ -1,8 +1,6 @@
 import os
 import fitz
 from PyPDF2 import PdfReader, PdfWriter
 from flask_app.general.截取pdf通用函数 import create_get_text_function
@@ -67,6 +65,7 @@ def merge_with_fitz(paths, output_path):
    for path in paths:
        if not path.strip():
            continue
        pdf_doc = None
        try:
            pdf_doc = fitz.open(path)
            get_text = create_get_text_function('fitz', pdf_doc)
@@ -88,7 +87,9 @@ def merge_with_fitz(paths, output_path):
            pdf_doc.close()
        except Exception as e:
            print(f"文件 '{path}' 无法使用 fitz 处理，错误: {e}")
-            continue
+        finally:
            if pdf_doc is not None:
                pdf_doc.close()
    if merged_pdf.page_count == 0:
        merged_pdf.close()
--- a/flask_app/general/截取pdf通用函数.py
+++ b/flask_app/general/截取pdf通用函数.py
@@ -1,5 +1,6 @@
 # flask_app/general/截取pdf通用函数.py
 import concurrent.futures
 import io
 import os
 import fitz
@@ -37,8 +38,10 @@ def get_start_and_common_header(input_path, end_page):
    - common_header: 公共页眉内容。
    - last_begin_index: 起始页的页码索引。
    """
    pdf_document = None
    try:
        # 提取公共页眉
        common_header = extract_common_header(input_path)  # 假设该函数已定义
        last_begin_index = -1
@@ -48,7 +51,6 @@ def get_start_and_common_header(input_path, end_page):
        )
        # 新增目录匹配模式
        catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
        try:
            # 尝试使用 PyPDF2 读取 PDF
            pdf_document = PdfReader(input_path)
@@ -86,58 +88,77 @@ def get_start_and_common_header(input_path, end_page):
    except Exception as e:
        print(f"Error in get_start_and_common_header: {e}")
        return "", -1  # 根据需求调整返回值
    finally:
        # 如果使用的是 PyMuPDF，记得关闭文档
        if pdf_document and hasattr(pdf_document, "close"):
            pdf_document.close()
 def check_pdf_pages(pdf_path, mode, logger):
    pdf_document = None
    try:
        try:
            # 尝试使用 PyPDF2 读取 PDF（新版可直接传入文件路径）
            pdf_document = PdfReader(pdf_path)
            num_pages = len(pdf_document.pages)
            # logger.info(f"使用 PyPDF2 成功读取 PDF '{pdf_path}' 的总页数为: {num_pages}")
        except Exception as e_pypdf2:
-            print(f"check_pdf_pages:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
+            logger.warning(f"check_pdf_pages: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
            # 如果 PyPDF2 读取失败，尝试使用 PyMuPDF 读取 PDF
            pdf_document = fitz.open(pdf_path)
            num_pages = pdf_document.page_count
        if num_pages <= 30:
            logger.info("PDF（invalid_path）页数小于或等于30页，跳过切分逻辑。")
            if mode == 'goods':
                return True, ['', '', '', '', '', '', pdf_path, '']
            else:
                return True, ['', '', '', '', '', pdf_path, '']
-        # 若页数大于30页，返回None表示继续处理
+
        # 若页数大于30页，返回 False 表示继续处理
        return False, []
    except Exception as e:
        logger.error(f"无法读取 PDF 页数: {e}")
        # 返回空列表意味着无法执行后续处理逻辑
        if mode == 'goods':
            return True, ['', '', '', '', '', '', pdf_path, '']
        else:
            return True, ['', '', '', '', '', pdf_path, '']
    finally:
        # 如果使用的是 PyMuPDF 或其他需要关闭的对象，确保关闭资源
        if pdf_document and hasattr(pdf_document, "close"):
            pdf_document.close()
 def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header):
    pdf_document = None
    using_pypdf2 = False
    try:
        if start_page is None or end_page is None:
            print("错误: start_page 或 end_page 为 None")
            return ""
-        # 尝试使用 PyPDF2 读取 PDF
+        # 尝试使用 PyPDF2 读取 PDF（新版可以直接传入路径）
        try:
            pdf_document = PdfReader(pdf_path)
            total_pages = len(pdf_document.pages)
-            get_text = lambda p: pdf_document.pages[p].extract_text()
+                # 定义提取文本函数
            get_text = create_get_text_function('pypdf2', pdf_document)
            using_pypdf2 = True
                # print("使用 PyPDF2 成功读取 PDF 文件。")
        except Exception as e_pypdf2:
            print(f"save_extracted_pages: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
-            # 尝试使用 PyMuPDF 读取 PDF
+            # 如果 PyPDF2 失败，尝试使用 PyMuPDF 读取 PDF
            try:
                pdf_document = fitz.open(pdf_path)
                total_pages = pdf_document.page_count
-                get_text = lambda p: pdf_document.load_page(p).get_text()
+                get_text = create_get_text_function('fitz', pdf_document)
                using_pypdf2 = False
                # print("使用 PyMuPDF 成功读取 PDF 文件。")
            except Exception as e_fitz:
-                print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
+                print(f"save_extracted_pages: 使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
                return ""
        # 检查页面范围的有效性
@@ -148,6 +169,7 @@ def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_s
        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
        # 如果 output_suffix 为 'notice' 并且 start_page > 0，则另存“目录前”的页面
        if output_suffix == 'notice' and start_page > 0:
            before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
            toc_page = -1
@@ -158,6 +180,7 @@ def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_s
                except Exception as e_extract:
                    print(f"提取第 {page_num} 页文本时出错: {e_extract}")
                    continue
                cleaned_text = clean_page_content(text, common_header)
                if regex.search(r'目\s*录', cleaned_text, regex.MULTILINE):
                    toc_page = page_num
@@ -204,8 +227,8 @@ def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_s
                    print(f"添加第 {page_num} 页到 output_doc 时出错: {e_page}")
                    continue
            try:
-                with open(output_pdf_path, 'wb') as f:
+                with open(output_pdf_path, 'wb') as f_out:
-                    output_doc.write(f)
+                    output_doc.write(f_out)
                print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
                return output_pdf_path
            except Exception as e_write:
@@ -213,24 +236,28 @@ def save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_s
                return ""
        else:
            output_doc = fitz.open()
            try:
                for page_num in range(start_page, end_page + 1):
                    try:
                        output_doc.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)
                    except Exception as e_page:
                        print(f"添加第 {page_num} 页到 output_doc 时出错: {e_page}")
                        continue
            try:
                output_doc.save(output_pdf_path)
                output_doc.close()
                print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
                return output_pdf_path
            except Exception as e_write:
                print(f"保存截取的页面时出错: {e_write}")
                return ""
            finally:
                output_doc.close()
    except Exception as e:
        print(f"发生未知错误: {e}")
        return ""
-
+    finally:
        # 如果使用的是 PyMuPDF 或其他需要关闭的对象，确保关闭资源
        if pdf_document and not using_pypdf2 and hasattr(pdf_document, "close"):
            pdf_document.close()
 def is_pdf_or_doc(filename):
    # 判断文件是否为PDF或Word文档
@@ -242,14 +269,14 @@ def convert_to_pdf(file_path):
        return docx2pdf(file_path)
    return file_path
-def get_invalid_file(file_path, output_folder, common_header, begin_page):
+def get_invalid_file(pdf_path, output_folder, common_header, begin_page):
    """
    从指定的PDF文件中提取无效部分并保存到输出文件夹中。
    begin_pattern匹配开头  end_pattern匹配结尾
    页数小于100不进行begin_pattern，默认start_page=0
    页数大于200时end_pattern从前往后匹配
    Args:
-        file_path (str): 输入的PDF文件路径。
+        pdf_path (str): 输入的PDF文件路径。
        output_folder (str): 提取后文件的输出文件夹路径。
        common_header (str): 公共头部文本，用于清理每页的内容。
@@ -284,23 +311,22 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
    exclusion_pattern = regex.compile(
        r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
    pdf_document = None
    try:
-        # 打开PDF文件
+        # 尝试使用 PyPDF2 读取 PDF
        try:
-            pdf_document = PdfReader(file_path)
+            pdf_document = PdfReader(pdf_path)
-            pdf_lib = 'pypdf2'
+            get_text = create_get_text_function('pypdf2', pdf_document)
            # 获取总页数（对于 pypdf2 的 PDFDocument）
            total_pages = len(pdf_document.pages)
        except Exception as e:
-            print(f"get_invalid_file:使用 pypdf2 读取失败，错误信息: {e}，切换至 fitz。")
+            pdf_document = fitz.open(pdf_path)
-            pdf_document = fitz.open(file_path)
+            get_text = create_get_text_function('fitz', pdf_document)
            pdf_lib = 'fitz'
            # 获取总页数（对于 fitz 的 PDFDocument）
            total_pages = pdf_document.page_count
        if total_pages <= 50:
-            print(f"PDF页数({total_pages}) <= 50，直接返回文件路径:{file_path}")
+            print(f"PDF页数({total_pages}) <= 50，直接返回文件路径: {pdf_path}")
-            return file_path, 0
+            return pdf_path, 0
-        get_text = create_get_text_function(pdf_lib, pdf_document)
+
        # 提取并清理每页的文本内容
        page_texts = []
        for i in range(total_pages):
@@ -308,11 +334,12 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
            cleaned_text = clean_page_content(text, common_header) if text else ""
            page_texts.append(cleaned_text)
-        # 定义查找起始页的函数
+        # 内部函数：查找起始页
        def find_start_page(begin_page):
-            for pattern in begin_patterns:
+            for pattern in begin_patterns:  # 需提前定义好
                for i in range(begin_page, min(begin_page + 30, total_pages)):
                    text = page_texts[i]
                    # 若存在排除模式则跳过
                    if regex.search(exclusion_pattern, text):
                        continue
                    if regex.search(pattern, text):
@@ -321,22 +348,25 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
                # print(f"未在前30页找到模式: {pattern.pattern}")
            return 0  # 默认从第一页开始
-        # 定义查找结束页的函数
+        # 内部函数：查找结束页
        def find_end_page():
            if total_pages > 200:
-                # print("总页数大于200页，结束页从前往后查找，跳过前30页。") #且先匹配合同
+                # 当页数较多时，从第31页开始正向查找结束模式
-                for pattern in end_patterns:
+                for pattern in end_patterns:  # 需提前定义好
                    for i in range(30, total_pages):
                        text = page_texts[i]
-                        exclusion_pattern = regex.compile(
+                        # 定义局部的排除模式
-                            r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制|评审要素|评审标准)\s*$', regex.MULTILINE)
+                        exclusion_pat = regex.compile(
-                        if regex.search(exclusion_pattern,text):
+                            r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制|评审要素|评审标准)\s*$',
                            regex.MULTILINE
                        )
                        if regex.search(exclusion_pat, text):
                            continue
                        if regex.search(pattern, text):
                            # print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}")
                            return i
            else:
-                # print("结束页从后往前查找，确保只剩前30页时停止。")
+                # 当页数较少时，从后向前查找（至少保留前30页）
                for pattern in end_patterns:
                    for i in range(total_pages - 1, 29, -1):
                        text = page_texts[i]
@@ -344,77 +374,76 @@ def get_invalid_file(file_path, output_folder, common_header, begin_page):
                            # print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}")
                            return i
-            # 如果没有匹配到，设置end_page逻辑
+            # 若未匹配到结束模式，设置默认结束页
            if total_pages > 200:
-                print(f"未找到结束模式，总页数大于200，平滑调整结束页为 {end_page}。" + file_path)
+                print(f"未找到结束模式，总页数大于200，平滑调整结束页为 {max(100, int(total_pages * 1 / 2))}。{pdf_path}")
                return max(100, int(total_pages * 1 / 2))
            elif 100 < total_pages <= 200:
-                print("未找到结束模式，总页数在100到200之间，设置结束页为总页数的三分之二。" + file_path)
+                print("未找到结束模式，总页数在100到200之间，设置结束页为总页数的三分之二。 " + pdf_path)
                return max(100, int(total_pages * 2 / 3))
            else:
-                print("未找到结束模式，设置结束页为最后一页。" + file_path)
+                print("未找到结束模式，设置结束页为最后一页。 " + pdf_path)
                return total_pages - 1
-        # 根据总页数决定是否查找起始页
+        # 根据总页数决定是否并发查找起始页与结束页
        if total_pages < 100:
-            # print("总页数少于100页，默认起始页为第一页。")
+            start_page_found = 0
            start_page = 0
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future_end = executor.submit(find_end_page)
-                end_page = future_end.result()
+                end_page_found = future_end.result()
        else:
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future_start = executor.submit(find_start_page, begin_page)
                future_end = executor.submit(find_end_page)
-                start_page = future_start.result()
+                start_page_found = future_start.result()
-                end_page = future_end.result()
+                end_page_found = future_end.result()
-        # print(f"起始页: {start_page + 1}, 结束页: {end_page + 1}")
+
-        # 验证页码范围
+        # 检查页码范围是否有效
-        if start_page > end_page:
+        if start_page_found > end_page_found:
-            print(f"无效的页码范围: 起始页 ({start_page + 1}) > 结束页 ({end_page + 1})")
+            print(f"无效的页码范围: 起始页 ({start_page_found + 1}) > 结束页 ({end_page_found + 1})")
            return "", 0
-        # 调用已实现的保存函数
+        # 调用已实现的保存函数，保存提取的页面
        output_path = save_extracted_pages(
-            pdf_path=file_path,
+            pdf_path=pdf_path,
            output_folder=output_folder,
-            start_page=start_page,
+            start_page=start_page_found,
-            end_page=end_page,
+            end_page=end_page_found,
            output_suffix="invalid",
            common_header=common_header
        )
-
+        return output_path, end_page_found
        # print(f"提取的页面 {start_page} 到 {end_page} 已保存至 {output_path}")
        return output_path, end_page
    except Exception as e:
-        print(f"处理文件 {file_path} 时发生错误: {e}")
+        print(f"处理文件 {pdf_path} 时发生错误: {e}")
        return "", 0
    finally:
        # 确保释放资源（对 fitz 的 pdf_document 需要关闭）
        if pdf_document and hasattr(pdf_document, "close"):
            pdf_document.close()
 def extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
                          output_suffix="normal", invalid_endpage=-1):
    start_page = None
    end_page = None
    flag = True
-
+    # 先尝试使用 PyPDF2
    try:
        # 尝试使用 PyPDF2 读取 PDF
        pdf_document = PdfReader(pdf_path)
        total_pages = len(pdf_document.pages)
        get_text = create_get_text_function('pypdf2', pdf_document)
        # print("使用 PyPDF2 成功读取 PDF 文件。")
    except Exception as e_pypdf2:
        print(f"extract_pages_generic:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
        # 如果 PyPDF2 失败，就再试 PyMuPDF
        try:
            # 如果 PyPDF2 失败，尝试使用 PyMuPDF 读取 PDF
            pdf_document = fitz.open(pdf_path)
            total_pages = pdf_document.page_count
            get_text = create_get_text_function('fitz', pdf_document)
            # print("使用 PyMuPDF 成功读取 PDF 文件。")
        except Exception as e_fitz:
            print(f"使用 PyMuPDF 读取 PDF 也失败: {e_fitz}")
-            return None, None  # 或者根据需求抛出异常
+            return None, None
    end_limit = total_pages if invalid_endpage == -1 else min(invalid_endpage, total_pages)
    for i in range(begin_page, end_limit):
        text = get_text(i)
@@ -447,7 +476,9 @@ def extract_pages_generic(pdf_path, begin_pattern, end_pattern, begin_page, comm
                    if regex.search(end_pattern, cleaned_text) and not regex.search(begin_pattern, cleaned_text):
                        end_page = i
                        break
-
+        # 如果使用的是 PyMuPDF，需要手动关闭文档
    if pdf_document and isinstance(pdf_document, fitz.Document):
        pdf_document.close()
    return start_page, end_page
@@ -534,7 +565,7 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
    """
    exclusion_pattern = regex.compile(
        r'(文件的构成|文件的组成|文件组成|文件构成|文件的编制|文件编制)\s*$', regex.MULTILINE)
-    def run_extraction(pdf_document,begin_pattern, extraction_stage='first'):
+    def run_extraction(begin_pattern, extraction_stage='first'):
        """
        使用提供的 begin_pattern 运行提取过程。
        根据 extraction_stage 判断是“第一次提取”还是“第三次提取”，
@@ -554,14 +585,11 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
        end_page = None
        combined_mid_pattern = None  # 中间页的组合模式
        catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
        total_pages = len(pdf_document.pages) if isinstance(pdf_document, PdfReader) else pdf_document.page_count
        end_limit = total_pages if invalid_endpage == -1 else min(invalid_endpage, total_pages)
        # ===== 遍历每一页，执行提取逻辑 =====
        for i in range(end_limit):
            try:
-                text = pdf_document.pages[i].extract_text() if isinstance(pdf_document,
+                text = get_text(i)
                                                                          PdfReader) else pdf_document.load_page(i).get_text()
                text = text or ""
                cleaned_text = clean_page_content(text, common_header)
            except Exception as e:
                print(f"提取第 {i} 页文本时出错: {e}")
@@ -621,13 +649,17 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
            path2 = path1
        return path1, path2
    pdf_document = None
    try:
        try:
            pdf_document = PdfReader(pdf_path)
            get_text = create_get_text_function('pypdf2', pdf_document)
            total_pages = len(pdf_document.pages)
        except Exception as e_pypdf2:
            print(f"extract_pages_tobidders_notice:使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")
            try:
                pdf_document = fitz.open(pdf_path)
                get_text = create_get_text_function('fitz', pdf_document)
                total_pages = pdf_document.page_count
                # print("使用 PyMuPDF 成功读取 PDF 文件。")
            except Exception as e_fitz:
@@ -641,7 +673,7 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
            r'(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知+\s*$',
            regex.MULTILINE
        )
-    start_page, mid_page, end_page = run_extraction(pdf_document,begin_pattern, extraction_stage='first')
+        start_page, mid_page, end_page = run_extraction(begin_pattern, extraction_stage='first')
        # 如果第一次提取成功，保存并返回路径
        if start_page is not None and mid_page is not None and end_page is not None:
            return perform_extraction_and_save(start_page, mid_page, end_page)
@@ -664,17 +696,30 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_page, common_h
            r'(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!与\s*)(?<!同\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|谈判|供应商|应答人|比选申请人).*须知前附表\s*$',
            regex.MULTILINE
        )
-    start_page, mid_page, end_page = run_extraction(pdf_document, new_begin_pattern, extraction_stage='third')
+        start_page, mid_page, end_page = run_extraction(new_begin_pattern, extraction_stage='third')
        if start_page is not None and mid_page is not None and end_page is not None:
            return perform_extraction_and_save(start_page, mid_page, end_page)
        else:
            # 所有提取尝试均失败
            print(f"三次提取tobidders_notice均失败!! {pdf_path}")
            return "", ""
    except Exception as e:
        print(f"处理文件 {pdf_path} 时发生错误: {e}")
        return "", ""
    finally:
        # 无论如何都关闭 pdf_document（对于 fitz 对象）
        if pdf_document and hasattr(pdf_document, "close"):
            pdf_document.close()
 def extract_pages_twice_tobidders_notice(pdf_path, common_header, begin_page):
    # 使用 with open 确保文件流在读取后关闭，避免文件句柄泄露
    try:
        pdf_document = PdfReader(pdf_path)
    except Exception as e:
        print(f"读取 PDF 文件失败: {e}")
        return None, None, None, None
    # 投标须知前附表占一章、投标须知正文占一章
    output_suffix = "tobidders_notice"
    begin_pattern = regex.compile(
@@ -696,16 +741,13 @@ def extract_pages_twice_tobidders_notice(pdf_path, common_header, begin_page):
    if start_page1 is None or end_page1 is None:
        return None, None, None,None
-    # 提取第二部分
+    # 提取第二部分 先检查end_page1页面的内容
-
+    text = get_text_pypdf2(pdf_document,end_page1)
    # 检查end_page1页面的内容
    text = pdf_document.pages[end_page1].extract_text() or ""
    cleaned_text = clean_page_content(text, common_header)
    match = end_pattern.search(cleaned_text)
    if match:
        # 获取匹配到的中文部分
-        chapter_title = match.group(1)
+        chapter_title = match.group(1) if match.groups() else ""
        # 检查是否包含排除关键词
        if any(word in chapter_title for word in exclusion_words):
            # 如果包含排除关键词，直接返回相同的路径
@@ -722,7 +764,6 @@ def extract_pages_twice_tobidders_notice(pdf_path, common_header, begin_page):
    return start_page1, end_page1, start_page2,end_page2
 #return start_page1, end_page1, end_page1
 def get_notice(pdf_path, output_folder, begin_page, common_header,invalid_endpage):
    begin_pattern = regex.compile(
        r'.*(?<!对应\s*)(?<!根据\s*)(?<!按照\s*)(?<!见\s*)(?<!同\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请|采购公告)[\)）]?\s*$',
--- a/flask_app/general/无效标和废标公共代码.py
+++ b/flask_app/general/无效标和废标公共代码.py
@@ -563,12 +563,8 @@ def handle_query(file_path, user_query, output_file, result_key, keywords):
            else:
                user_query = generate_full_user_query(output_file, user_query)
                model_ans = qianwen_plus(user_query)  # 豆包模型返回结果
-                # file_id = upload_file(output_file)
+                num_list = process_string_list(model_ans)  # 处理模型返回的序号列表
                # model_ans = qianwen_long(file_id, user_query)
                num_list = process_string_list(model_ans)  # 处理模型返回的序号
                print(result_key + "选中的序号:" + str(num_list))
            # print(all_texts1_list)
            # print(all_text1_items)
            for index in num_list:
                if 1 <= index <= len(all_texts1_list):
                    original_global_idx = all_text1_items[index - 1][0]
--- a/flask_app/general/读取文件/clean_pdf.py
+++ b/flask_app/general/读取文件/clean_pdf.py
@@ -3,6 +3,8 @@ import threading
 import fitz
 from PyPDF2 import PdfReader
 from docx import Document
 import queue
 from queue import Queue, Empty
 def extract_common_header(pdf_path):
    """
    提取 PDF 文件的公共页眉。
@ -181,36 +183,60 @@ def clean_page_content(text, common_header):
    text = re.sub(r'^\s*[—-]\s*(第\s*)?\d{1,3}\s*(页)?\s*[—-]\s*', '', text)  # 删除形如 '—2—', '-2-', 或 '-第2页-' 的页码
    return text
 def read_page_text_with_timeout(page, timeout=3):
    """
-    尝试在子线程里执行 page.extract_text()。
+    线程安全的带超时文本提取（改进版）
-    如果超过 `timeout` 秒没有完成，就返回 None，表示超时。
+    特性：
-    如果正常完成，就返回提取到的文本。
+    1. 使用Queue保证线程间通信安全
    2. 添加双重超时控制
    3. 完善的资源清理机制
    """
-    done = threading.Event()
+    result_queue = Queue(maxsize=1)
-    result = []
+    done_flag = threading.Event()
-    def wrapper():
+    def extract_worker():
        try:
-            txt = page.extract_text()  # 可能会卡住的操作
+            # 强制设置文本提取上限（防止底层库卡死）
-            result.append(txt)
+            txt = page.extract_text() or ""
            result_queue.put(txt)
        except Exception as e:
-            print("提取文本时出错：", e)
+            print(f"文本提取异常: {str(e)}")
-            result.append("")
+            result_queue.put("")
        finally:
-            done.set()
+            done_flag.set()  # 确保事件触发
-    # 启动后台线程
+    # 启动工作线程
-    thread = threading.Thread(target=wrapper, daemon=True)
+    worker = threading.Thread(
-    thread.start()
+        target=extract_worker,
        daemon=True,
        name=f"PDFTextExtract-{id(page)}"
    )
    worker.start()
-    # 等待提取结果，超过 timeout 秒则视为超时
+    # 双重等待机制
-    finished_in_time = done.wait(timeout)
+    try:
-    if not finished_in_time:
+        # 第一阶段：等待事件标志
-        print(f"单页文本提取超时（超过 {timeout} 秒）！")
+        if done_flag.wait(timeout):
            # 第二阶段：立即获取结果
            return result_queue.get_nowait()
        # 超时处理流程
        print(f"文本提取超时（{timeout}s），终止操作")
        return None
-    return result[0] if result else ""
+    except Empty:
        # 罕见情况：事件触发但队列无数据
        print("队列数据异常，返回空文本")
        return ""
    finally:
        # 资源清理
        if worker.is_alive():
            try:
                worker._stop()  # 强制终止（慎用）
            except Exception:
                pass
 def is_scanned_pdf(file_path, max_pages=15, page_timeout=3, overall_timeout=30):
    """
@ -220,58 +246,43 @@ def is_scanned_pdf(file_path, max_pages=15, page_timeout=3, overall_timeout=30):
    3. 如果前 max_pages 页都检测完成，均无可见文本，则返回 True（认为是扫描件）。
    4. 如果需要整体超时（overall_timeout），则在最外层加一个封装线程进行控制。
    """
-    def core_check(result_container, done_event):
+    result_queue = queue.Queue()
-        """
+    done_event = threading.Event()
-        真正的核心逻辑，执行完后把结果塞进 result_container[0]，
+    def core_check():
        然后调用 done_event.set() 告知“整个检查流程结束”。
        """
        try:
            with open(file_path, 'rb') as f:
                reader = PdfReader(f)
                for i, page in enumerate(reader.pages):
-                    if i >= max_pages:
+                    if i >= max_pages or done_event.is_set():
                        break
                    # 尝试在 page_timeout 内获取文本
                    text = read_page_text_with_timeout(page, page_timeout)
                    if text is None:
-                        print(f"第 {i+1} 页文本提取超时，直接判定为扫描件。")
+                        print(f"Page {i + 1} timeout")
-                        result_container[0] = True
+                        result_queue.put(True)
                        done_event.set()
                        return
                    if text.strip():
-                        print(f"第 {i+1} 页检测到文本，判定为非扫描件。")
+                        print(f"Text found on page {i + 1}")
-                        result_container[0] = False
+                        result_queue.put(False)
                        done_event.set()
                        return
                result_queue.put(True)
        except Exception as e:
-            print("处理 PDF 文件时发生异常：", e)
+            print(f"Processing error: {str(e)}")
-            result_container[0] = True
+            result_queue.put(True)
-            done_event.set()
+        finally:
            return
        print(f"前 {max_pages} 页均未检测到文本，判定为扫描件。")
        result_container[0] = True
            done_event.set()
-    result_container = [None]  # 用于在子线程中传递结果
+    thread = threading.Thread(target=core_check, daemon=True)
    done = threading.Event()
    thread = threading.Thread(target=core_check, args=(result_container, done), daemon=True)
    thread.start()
    try:
        result = result_queue.get(timeout=overall_timeout)
    except queue.Empty:
        print(f"Overall timeout after {overall_timeout}s")
        return True  # 或定义特殊超时状态码
-    # 等待整体流程结束，最多等待 overall_timeout 秒
+    return result
    finished_in_time = done.wait(overall_timeout)
    if not finished_in_time:
        print(f"整体检查超时（超过 {overall_timeout} 秒），返回默认结果。")
        return True  # 或者根据需要返回其它默认值
    # 打印最终结果调试信息
    if result_container[0]:
        print("最终结果：认为是扫描件（未检测到有效文本或发生单页超时）")
    else:
        print("最终结果：认为非扫描件（检测到有效文本）")
    return result_container[0]
 def is_pure_image(docx_path, percentage=0.3):
    """
--- a/flask_app/routes/判断是否是招标文件.py
+++ b/flask_app/routes/判断是否是招标文件.py
@ -12,7 +12,9 @@ def judge_zbfile_exec(file_path):
        start_time = time.time()
        # 检查文件是否为PDF格式
        if file_path.lower().endswith('.pdf'):
-            reader = PdfReader(file_path)
+            # 使用 with 语句确保文件关闭
            with open(file_path, 'rb') as f:
                reader = PdfReader(f)
                num_pages = len(reader.pages)
            if num_pages <= 5:
                return False
@ -32,11 +34,7 @@ def judge_zbfile_exec(file_path):
        print(f"judge_zbfile_exec实际耗时：{end_time - start_time:.2f} 秒")
        print(f"判断是否属于招标文件：{model_res}")
-        # 根据模型返回结果判断
+        return '否' not in model_res
        if '否' in model_res:
            return False
        else:
            return True
    except Exception as e:
        print(f"处理文件时出错: {e}")
--- a/flask_app/start_up.py
+++ b/flask_app/start_up.py
@ -1,4 +1,5 @@
 # flask_app/start_up.py
 import os
 from flask import Flask, g
 from flask_app.ConnectionLimiter import ConnectionLimiter
@ -11,16 +12,10 @@ from flask_app.general.llm.清除file_id import delete_file_by_ids,read_file_ids
 from flask_app.routes.judge_zbfile import judge_zbfile_bp
 class FlaskAppWithLimiter(Flask):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # 初始化一个字典来存储每个蓝图的限流器
        self.connection_limiters = {}
 def create_app():
    app = FlaskAppWithLimiter(__name__)
    # 创建全局日志记录器
    app = Flask(__name__)
    app.global_logger = create_logger_main('global_log')  # 全局日志记录器
    # 注册蓝图
    app.register_blueprint(get_deviation_bp)
@ -28,11 +23,14 @@ def create_app():
    app.register_blueprint(upload_bp)
    app.register_blueprint(test_zbparse_bp)
    app.register_blueprint(judge_zbfile_bp)
    # app.connection_limiters['upload'] = ConnectionLimiter(max_connections=100)
    # app.connection_limiters['get_deviation'] = ConnectionLimiter(max_connections=100)
    # app.connection_limiters['default'] = ConnectionLimiter(max_connections=100)
    # app.connection_limiters['judge_zbfile'] = ConnectionLimiter(max_connections=100)
    @app.teardown_request
    def check_fd(exception=None):
        logger = g.logger
        end_fd = os.listdir('/proc/self/fd')
        leaked = set(end_fd) - set(g.start_fd)
        if leaked:
            logger.warning(f"潜在FD泄漏: {len(leaked)}个未关闭")
    # @app.teardown_request
    # def teardown_request(exception):
    #     # 接口请求之后都会执行该代码，做一些清理工作
--- a/flask_app/工程标/截取pdf工程标版.py
+++ b/flask_app/工程标/截取pdf工程标版.py
@ -29,6 +29,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
    处理 PDF 文件，作为 truncate_pdf_main 的后备方法。
    此函数将在所有模式对都失败后调用。
    """
    pdf_document=None
    try:
        try:
            pdf_document = PdfReader(pdf_path)
@ -109,6 +110,9 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
    except Exception as e:
        print(f"Error in extract_pages_twice: {e}")
        return ""
    finally:
        if pdf_document and hasattr(pdf_document, 'close'):
            pdf_document.close()
 def truncate_pdf_main_engineering(input_path, output_folder, selection, logger, output_suffix="default",invalid_endpage=-1):
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@ -55,7 +55,6 @@ def get_patterns_for_evaluation_method():
    return begin_pattern, end_pattern
 def extract_pages_qualification(pdf_path, begin_page, common_header):
    pdf_document = PdfReader(pdf_path)
    # 开始匹配模式，仅匹配“附录”、“附件”或“附表”
    begin_pattern = regex.compile(
        r'^(?:附录(?:一|1)?[：:]?|附件(?:一|1)?[：:]?|附表(?:一|1)?[：:]?)(?!的)',
@ -84,7 +83,8 @@ def extract_pages_qualification(pdf_path, begin_page, common_header):
    end_page = None
    include_keywords = ["资格审查", "资质审查", "符合性审查","资格性审查", "符合性检查","资格性检查", "资格检查"]
    exclude_keywords = ["声明函", "承诺函"]
-
+    pdf_document = None
    try:
        try:
            # 尝试使用 PyPDF2 读取 PDF
            pdf_document = PdfReader(pdf_path)
@ -143,6 +143,9 @@ def extract_pages_qualification(pdf_path, begin_page, common_header):
                        print(f"qualification：找到结束页 (章节标题): {end_page}")
                        break  # 找到结束页后退出循环
        return start_page, end_page
    finally:
        if pdf_document and hasattr(pdf_document, 'close'):
            pdf_document.close()
 def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page, logger):
@ -322,8 +325,8 @@ if __name__ == "__main__":
    pdf_path=r"C:\Users\Administrator\Desktop\fsdownload\2c3e2291-6804-4ef0-b4a8-6f457edd5709\ztbfile.pdf"
    # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件（广水市教育局封闭管理）.pdf"
    # pdf_path=r"C:\Users\Administrator\Desktop\文件解析问题\文件解析问题\1414cb9c-7bf4-401c-8761-2acde151b9c2\ztbfile.pdf"
-    output_folder = r"C:\Users\Administrator\Desktop\货物标\output5"
+    output_folder = r"C:\Users\Administrator\Desktop\货物标\output4"
    # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
-    selection = 5  # 例如：1 - 公告 notice , 2 - 评标办法 evaluation_method, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求 procurement  6-invalid
+    selection = 4  # 例如：1 - 公告 notice , 2 - 评标办法 evaluation_method, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求 procurement  6-invalid
    generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection,logger)
    print(generated_files)