From cb153889a4363eb7cc7743969c235746a76efc24 Mon Sep 17 00:00:00 2001
From: zy123 <646228430@qq.com>
Date: Tue, 17 Dec 2024 18:44:58 +0800
Subject: [PATCH] =?UTF-8?q?12.17=20=E6=97=A0=E6=95=88=E6=8A=95=E6=A0=87?=
 =?UTF-8?q?=E3=80=81=E5=BA=9F=E6=A0=87?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 flask_app/general/merge_pdfs.py               | 202 ++++----------
 flask_app/general/截取pdf_main.py             | 127 +++++++++
 flask_app/general/截取pdf通用函数.py          |  57 ++++
 flask_app/general/通用功能函数.py             |  11 +-
 flask_app/old_version/形式响应评审old.py      |   2 +-
 flask_app/old_version/招标文件解析_old.py     |   2 +-
 flask_app/old_version/解析old_old.py          |   2 +-
 flask_app/routes/偏离表main.py                |  17 +-
 flask_app/routes/小解析main.py                |  26 +-
 flask_app/routes/工程标解析main.py            |  28 +-
 flask_app/routes/货物标解析main.py            |  13 +-
 .../工程标/{截取pdf.py => 截取pdf工程标版.py} | 257 ++++--------------
 flask_app/货物标/截取pdf货物标版.py           | 230 ++--------------
 13 files changed, 357 insertions(+), 617 deletions(-)
 create mode 100644 flask_app/general/截取pdf_main.py
 create mode 100644 flask_app/general/截取pdf通用函数.py
 rename flask_app/工程标/{截取pdf.py => 截取pdf工程标版.py} (69%)

diff --git a/flask_app/general/merge_pdfs.py b/flask_app/general/merge_pdfs.py
index 513b875..c6db9e9 100644
--- a/flask_app/general/merge_pdfs.py
+++ b/flask_app/general/merge_pdfs.py
@@ -91,21 +91,20 @@ def find_and_merge(target_path, output_suffix):
         paths=[target_path,full_path]
     merge_pdfs(paths,target_path)
 
-def merge_selected_pdfs_for_engineering(output_folder, truncate_files, output_path, base_file_name):
+def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_name, mode='engineering'):
     """
-    合并 output_folder 中以 {base_file_name}_before.pdf 结尾的 PDF 文件，
-    以及 truncate_files 中以指定后缀结尾的文件，按照指定顺序合并。
+    通用的 PDF 合并函数，根据不同的模式合并指定的文件。
 
     参数：
     - output_folder (str): 包含以 {base_file_name}_before.pdf 结尾的 PDF 文件的文件夹路径。
     - truncate_files (list): 包含 PDF 文件路径的列表。
     - output_path (str): 合并后的 PDF 文件保存路径。
     - base_file_name (str): 用于匹配文件名的基础名称。
+    - mode (str): 合并模式，支持 'engineering' 和 'goods'。
 
     返回:
-    - str: 合并后的 PDF 文件路径，如果未找到所有需要合并的文件则返回 ""。
+    - str: 如果合并成功，返回 output_path；否则，返回空字符串 ""。
     """
-    # 1. 获取 output_folder 中所有文件
     try:
         all_output_files = os.listdir(output_folder)
     except FileNotFoundError:
@@ -115,50 +114,82 @@ def merge_selected_pdfs_for_engineering(output_folder, truncate_files, output_pa
         print(f"没有权限访问输出文件夹 '{output_folder}'。")
         return ""
 
-    # 2. 定义要选择的文件后缀及合并顺序，包括 before 文件
-    desired_suffixes = [
-        f'{base_file_name}_before.pdf',
-        f'{base_file_name}_notice.pdf',
-        f'{base_file_name}_tobidders_notice_table.pdf'
-    ]
+    if mode == 'engineering':
+        required_suffixes = [
+            f'{base_file_name}_before.pdf',
+            f'{base_file_name}_notice.pdf',
+            f'{base_file_name}_tobidders_notice_table.pdf'
+        ]
+        optional_suffixes = []
+    elif mode == 'goods':
+        required_suffixes = [
+            f'{base_file_name}_before.pdf',
+            f'{base_file_name}_notice.pdf',
+            f'{base_file_name}_tobidders_notice_part1.pdf'
+        ]
+        optional_suffixes = [
+            f'{base_file_name}_tobidders_notice_part2.pdf'
+        ]
+    else:
+        print(f"未知的合并模式: {mode}")
+        return ""
 
     all_pdfs_to_merge = []
-    missing_files = []  # 用于记录缺失的文件
+    missing_files = []
 
-    for suffix in desired_suffixes:
+    # 处理必需的文件
+    for suffix in required_suffixes:
         if suffix == f'{base_file_name}_before.pdf':
-            # 从 output_folder 中选择以 {base_file_name}_before.pdf 结尾的文件
             matching_files = [
                 os.path.join(output_folder, f)
                 for f in all_output_files
                 if f.endswith(suffix)
             ]
         else:
-            # 从 truncate_files 中选择以指定后缀结尾的文件
             matching_files = [f for f in truncate_files if f.endswith(suffix)]
 
         if matching_files:
-            # 如果找到多个匹配的文件，按名称排序并添加
             matching_files_sorted = sorted(matching_files)
             all_pdfs_to_merge.extend(matching_files_sorted)
             for f in matching_files_sorted:
                 print(f"选中文件: {f}")
         else:
             print(f"没有找到以 '{suffix}' 结尾的文件。")
-            missing_files.append(suffix)  # 记录缺失的文件
+            missing_files.append(suffix)
 
-    # 检查是否所有需要的文件都已找到
+    # 处理可选的文件
+    for suffix in optional_suffixes:
+        if suffix == f'{base_file_name}_before.pdf':
+            matching_files = [
+                os.path.join(output_folder, f)
+                for f in all_output_files
+                if f.endswith(suffix)
+            ]
+        else:
+            matching_files = [f for f in truncate_files if f.endswith(suffix)]
+
+        if matching_files:
+            matching_files_sorted = sorted(matching_files)
+            all_pdfs_to_merge.extend(matching_files_sorted)
+            for f in matching_files_sorted:
+                print(f"选中文件: {f}")
+        else:
+            print(f"可选文件 '{suffix}' 未找到，继续合并。")
+
+    # 检查是否所有必需的文件都已找到
     if missing_files:
         print("缺少以下必要的 PDF 文件，无法进行合并:")
         for missing in missing_files:
             print(f" - {missing}")
         return ""
 
+    print(f"总共将要合并的 PDF 文件数量: {len(all_pdfs_to_merge)}")
+
     # 过滤掉不存在或为空的文件路径
-    all_pdfs_to_merge = [f for f in all_pdfs_to_merge if os.path.isfile(f)]
+    all_pdfs_to_merge = [f for f in all_pdfs_to_merge if os.path.isfile(f) and os.path.getsize(f) > 0]
 
     if not all_pdfs_to_merge:
-        print("没有找到要合并的 PDF 文件。")
+        print("没有找到要合并的有效 PDF 文件。")
         return ""
 
     # 调用 merge_pdfs 函数进行合并
@@ -171,134 +202,6 @@ def merge_selected_pdfs_for_engineering(output_folder, truncate_files, output_pa
 
     # 检查合并后的文件是否存在且不为空
     if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
-        # 合并成功，删除 {base_file_name}_before.pdf 文件
-        before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
-        if os.path.exists(before_pdf_path):
-            try:
-                os.remove(before_pdf_path)
-                # print(f"已删除文件: {before_pdf_path}")
-            except Exception as e:
-                print(f"删除文件 {before_pdf_path} 时出错: {e}")
-        else:
-            print(f"未找到要删除的文件: {before_pdf_path}")
-
-        return output_path
-    else:
-        print(f"合并失败，没有生成 '{output_path}'。")
-        return ""
-
-
-#合并封面+招标公告+投标人须知前附表+须知正文
-def merge_selected_pdfs_for_goods(output_folder, truncate_files, output_path, base_file_name):
-    """
-    合并 output_folder 中以 {base_file_name}_before.pdf 结尾的 PDF 文件，
-    以及 truncate_files 中以指定后缀结尾的文件，按照指定顺序合并。
-
-    参数：
-    - output_folder (str): 包含以 {base_file_name}_before.pdf 结尾的 PDF 文件的文件夹路径。
-    - truncate_files (list): 包含 PDF 文件路径的列表。
-    - output_path (str): 合并后的 PDF 文件保存路径。
-    - base_file_name (str): 用于匹配文件名的基础名称。
-
-    返回:
-    - str: 如果合并成功，返回 output_path；否则，返回空字符串 ""。
-    """
-    # 1. 获取 output_folder 中所有文件
-    try:
-        all_output_files = os.listdir(output_folder)
-    except FileNotFoundError:
-        print(f"输出文件夹 '{output_folder}' 未找到。")
-        return ""
-    except PermissionError:
-        print(f"没有权限访问输出文件夹 '{output_folder}'。")
-        return ""
-
-    # 2. 定义要选择的文件后缀及合并顺序
-    required_suffixes = [
-        f'{base_file_name}_before.pdf',
-        f'{base_file_name}_notice.pdf',
-        f'{base_file_name}_tobidders_notice_part1.pdf'
-    ]
-
-    optional_suffixes = [
-        f'{base_file_name}_tobidders_notice_part2.pdf'
-    ]
-
-    all_pdfs_to_merge = []
-    missing_files = []  # 用于记录缺失的文件
-
-    # 3. 处理必需的文件
-    for suffix in required_suffixes:
-        if suffix == f'{base_file_name}_before.pdf':
-            # 从 output_folder 中选择以 {base_file_name}_before.pdf 结尾的文件
-            matching_files = [
-                os.path.join(output_folder, f)
-                for f in all_output_files
-                if f.endswith(suffix)
-            ]
-        else:
-            # 从 truncate_files 中选择以指定后缀结尾的文件
-            matching_files = [f for f in truncate_files if f.endswith(suffix)]
-
-        if matching_files:
-            # 如果找到多个匹配的文件，按名称排序并添加
-            matching_files_sorted = sorted(matching_files)
-            all_pdfs_to_merge.extend(matching_files_sorted)
-            # for f in matching_files_sorted:
-                # print(f"选中文件: {f}")
-        else:
-            print(f"没有找到以 '{suffix}' 结尾的文件。")
-            missing_files.append(suffix)  # 记录缺失的文件
-
-    # 4. 处理可选的文件
-    for suffix in optional_suffixes:
-        if suffix == f'{base_file_name}_before.pdf':
-            # 从 output_folder 中选择以 {base_file_name}_before.pdf 结尾的文件
-            matching_files = [
-                os.path.join(output_folder, f)
-                for f in all_output_files
-                if f.endswith(suffix)
-            ]
-        else:
-            # 从 truncate_files 中选择以指定后缀结尾的文件
-            matching_files = [f for f in truncate_files if f.endswith(suffix)]
-
-        if matching_files:
-            # 如果找到多个匹配的文件，按名称排序并添加
-            matching_files_sorted = sorted(matching_files)
-            all_pdfs_to_merge.extend(matching_files_sorted)
-            for f in matching_files_sorted:
-                print(f"选中文件: {f}")
-        else:
-            print(f"可选文件 '{suffix}' 未找到，继续合并。")
-
-    # 5. 检查是否所有必需的文件都已找到
-    if missing_files:
-        print("缺少以下必要的 PDF 文件，无法进行合并:")
-        for missing in missing_files:
-            print(f" - {missing}")
-        return ""
-
-    print(f"总共将要合并的 PDF 文件数量: {len(all_pdfs_to_merge)}")
-
-    # 6. 过滤掉不存在或为空的文件路径
-    all_pdfs_to_merge = [f for f in all_pdfs_to_merge if os.path.isfile(f) and os.path.getsize(f) > 0]
-
-    if not all_pdfs_to_merge:
-        print("没有找到要合并的有效 PDF 文件。")
-        return ""
-
-    # 7. 调用 merge_pdfs 函数进行合并
-    try:
-        merge_pdfs(all_pdfs_to_merge, output_path)
-        print(f"已成功合并 PDF 文件到 '{output_path}'。")
-    except Exception as e:
-        print(f"合并 PDF 文件时出错: {e}")
-        return ""
-
-    # 8. 检查合并后的文件是否存在且不为空
-    if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
-        # 合并成功，删除 {base_file_name}_before.pdf 文件
         before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf")
         if os.path.exists(before_pdf_path):
             try:
@@ -308,8 +211,7 @@ def merge_selected_pdfs_for_goods(output_folder, truncate_files, output_path, ba
                 print(f"删除文件 {before_pdf_path} 时出错: {e}")
         else:
             print(f"未找到要删除的文件: {before_pdf_path}")
-
         return output_path
     else:
         print(f"合并失败，没有生成 '{output_path}'。")
-        return ""
\ No newline at end of file
+        return ""
diff --git a/flask_app/general/截取pdf_main.py b/flask_app/general/截取pdf_main.py
new file mode 100644
index 0000000..b435d72
--- /dev/null
+++ b/flask_app/general/截取pdf_main.py
@@ -0,0 +1,127 @@
+import concurrent.futures
+import os
+import time
+
+from flask_app.general.merge_pdfs import merge_selected_pdfs
+from flask_app.general.截取pdf通用函数 import check_pdf_pages
+from flask_app.general.通用功能函数 import get_global_logger
+from flask_app.工程标.截取pdf工程标版 import truncate_pdf_main_engineering
+from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main_goods
+
+
+def truncate_pdf_multiple(pdf_path, output_folder, logger,mode='goods',selections=None):
+    """
+    Truncate a PDF into the selected sections in parallel, then merge the base-info parts.
+
+    Args:
+        pdf_path (str): Path of the input PDF file.
+        output_folder (str): Folder that receives the truncated files.
+        logger (Logger): Logger used for progress and error messages.
+        mode (str, optional): 'goods' or 'engineering'. Defaults to 'goods'.
+        selections (List[int], optional): Sections to cut. Defaults to [1, 2, 3, 4, 5].
+
+    Returns:
+        list: Truncated file paths plus the merged file path; "" marks a failed item.
+    Raises:
+        ValueError: If mode is neither 'goods' nor 'engineering'.
+    """
+    if selections is None:
+        selections = [1, 2, 3, 4, 5]
+
+    # Reject unknown modes before doing any work.
+    if mode not in ['goods', 'engineering']:
+        raise ValueError("mode 参数必须是 'goods' 或 'engineering'")
+    def handle_exception(selection):
+        # Selection 4 gets two placeholders, every other selection one.
+        return ["", ""] if selection == 4 else [""]
+
+    # Pick the mode-specific truncation routine; the merge mode mirrors the validated mode.
+    if mode == 'goods':
+        truncate_function = truncate_pdf_main_goods
+    else:  # mode == 'engineering'
+        truncate_function = truncate_pdf_main_engineering
+    merge_mode = mode
+
+    mode_flag = 1  # 1: five or more selections; 2: a partial set (fewer than five)
+    num_selections = len(selections)
+    if num_selections < 5:
+        mode_flag = 2
+
+    res = check_pdf_pages(pdf_path, mode_flag, logger)
+    if res is not None:
+        return res  # placeholder list of empty strings; truncation is skipped
+
+    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]  # file name without extension
+    truncate_files = []
+
+    # One worker per selection; max(1, ...) because ThreadPoolExecutor rejects max_workers=0.
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, num_selections)) as executor:
+        # Submit every task up front, keyed by its selection.
+        future_to_selection = {
+            selection: executor.submit(
+                truncate_function,
+                pdf_path,
+                output_folder,
+                selection,
+                logger
+                # Extra mode-specific arguments (e.g. output_suffix) would be added here.
+            )
+            for selection in selections
+        }
+
+        # Collect results in the caller's selection order, not completion order.
+        for selection in selections:
+            future = future_to_selection.get(selection)
+            try:
+                files = future.result()
+                if files:
+                    truncate_files.extend(files)
+                # An empty result adds nothing here; truncate_function is expected to pad and log.
+            except Exception as e:
+                logger.error(f"Selection {selection} 生成了一个异常: {e}")
+                # Keep the result list aligned by adding placeholders for the failed selection.
+                truncate_files.extend(handle_exception(selection))
+
+    # Path of the merged base-info PDF.
+    merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
+
+    # Merge the relevant pieces; an empty return value means the merge failed.
+    merged_path = merge_selected_pdfs(
+        output_folder,
+        truncate_files,
+        merged_output_path,
+        base_file_name,
+        mode=merge_mode
+    )
+
+    if merged_path:
+        # Merge succeeded: append the merged file path.
+        truncate_files.append(merged_path)
+        logger.info(f"已成功合并 PDF 文件到 '{merged_output_path}'。")
+    else:
+        # Merge failed: append an empty placeholder.
+        truncate_files.append("")
+        logger.warning(f"合并失败，没有生成合并文件 for {pdf_path}")
+
+    logger.info("已截取文件路径: " + str(truncate_files))
+    return truncate_files
+
+if __name__ == "__main__":  # manual smoke run against hard-coded local paths
+    logger=get_global_logger("123")
+    start_time = time.time()
+    # input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
+    pdf_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest5.pdf"
+
+    # pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
+    # input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
+    output_folder = r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp"
+    # selections = [1, 4]  # only process selections 4 and 1
+    selections=[5]
+    files=truncate_pdf_multiple(pdf_path,output_folder,logger,'engineering',selections)
+    print(files)
+    # selection = 1  # e.g. 1 - tender notice, 2 - evaluation method, 3 - qualification review, 4 - bidder-instructions table + body, 5 - invalid bids
+    # generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection,logger)
+    # print(generated_files)
+    # print("生成的文件:", generated_files)
+    end_time = time.time()
+    print("耗时：" + str(end_time - start_time))
\ No newline at end of file
diff --git a/flask_app/general/截取pdf通用函数.py b/flask_app/general/截取pdf通用函数.py
new file mode 100644
index 0000000..b00f8f6
--- /dev/null
+++ b/flask_app/general/截取pdf通用函数.py
@@ -0,0 +1,57 @@
+#flask_app/general/截取pdf通用函数.py
+import concurrent.futures
+import os
+
+import regex
+from PyPDF2 import PdfReader
+
+from flask_app.general.clean_pdf import extract_common_header, clean_page_content
+
+def get_start_and_common_header(input_path,end_page):  # returns (common_header, 0-based index of the first notice page, or 0)
+    common_header = extract_common_header(input_path)
+    last_begin_index = 0
+    begin_pattern = regex.compile(
+        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)）]?\s*$',
+        regex.MULTILINE
+    )
+    # Pattern for a table-of-contents heading ("目 录") at the end of a line.
+    catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
+
+    pdf_document = PdfReader(input_path)
+    for i, page in enumerate(pdf_document.pages):
+        if i > end_page:
+            return common_header, 0  # nothing matched within pages 0..end_page: fall back to index 0
+        text = page.extract_text()
+        if text:
+            cleaned_text = clean_page_content(text, common_header)
+            # Check for a table-of-contents page (presumably its entries could match begin_pattern).
+            if catalog_pattern.search(cleaned_text):
+                continue  # table-of-contents page: skip it
+            if begin_pattern.search(cleaned_text):
+                last_begin_index = i  # first matching page, 0-based
+                return common_header, last_begin_index
+    return common_header, last_begin_index
+
+def check_pdf_pages(pdf_path, mode,logger):  # returns a placeholder list to skip truncation, or None to continue
+    try:
+        reader = PdfReader(pdf_path)
+        num_pages = len(reader.pages)
+        logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}")
+        if num_pages <= 50:
+            logger.info("PDF页数小于或等于50页，跳过切分逻辑。")
+            if mode==1:
+                return ['', '', '', '', '', '', '']  # 7 placeholders for mode 1
+            else:
+                return ['','','','']  # 4 placeholders for any other mode
+        # More than 50 pages: return None so the caller goes on to truncate.
+        return None
+    except Exception as e:
+        logger.error(f"无法读取 PDF 页数: {e}")
+        # Unreadable PDF: return the same placeholder lists so follow-up processing is skipped.
+        if mode == 1:
+            return ['', '', '', '', '', '', '']
+        else:
+            return ['', '', '', '']
+
+
+
diff --git a/flask_app/general/通用功能函数.py b/flask_app/general/通用功能函数.py
index 9e12881..3edc369 100644
--- a/flask_app/general/通用功能函数.py
+++ b/flask_app/general/通用功能函数.py
@@ -1,12 +1,9 @@
 # -*- encoding:utf-8 -*-
 import ast
-import json
+import logging
 import re
-from collections import OrderedDict
-
 from flask_app.general.json_utils import clean_json_string
 from flask_app.general.多线程提问 import multi_threading
-from flask_app.general.通义千问long import upload_file
 from flask_app.工程标.判断是否分包等 import read_questions_from_judge
 
 def process_judge_questions(judge_file_path, chosen_numbers, file_id, baseinfo_list1):
@@ -72,3 +69,9 @@ def process_string_list(string_list):
         # 如果没有匹配到内容，返回空列表
         return []
 
+def get_global_logger(unique_id):  # returns the logger named unique_id (root logger when None)
+    if unique_id is None:
+        return logging.getLogger()  # no id given: use the root logger
+    logger = logging.getLogger(unique_id)
+    return logger
+
diff --git a/flask_app/old_version/形式响应评审old.py b/flask_app/old_version/形式响应评审old.py
index 94ef52c..7101cc8 100644
--- a/flask_app/old_version/形式响应评审old.py
+++ b/flask_app/old_version/形式响应评审old.py
@@ -6,7 +6,7 @@ import time
 from flask_app.general.多线程提问 import multi_threading
 from flask_app.工程标.根据条款号整合json import process_and_merge_entries,process_and_merge2
 from flask_app.general.json_utils import extract_content_from_json
-from flask_app.工程标.截取pdf import truncate_pdf_main
+from flask_app.工程标.截取pdf工程标版 import truncate_pdf_main
 from flask_app.工程标.提取json工程标版 import convert_clause_to_json
 prompt = """
 # 角色
diff --git a/flask_app/old_version/招标文件解析_old.py b/flask_app/old_version/招标文件解析_old.py
index 52ea4b2..2ef6fd8 100644
--- a/flask_app/old_version/招标文件解析_old.py
+++ b/flask_app/old_version/招标文件解析_old.py
@@ -3,7 +3,7 @@ import json
 import logging
 import time
 from concurrent.futures import ThreadPoolExecutor
-from flask_app.工程标.截取pdf import truncate_pdf_multiple
+from flask_app.工程标.截取pdf工程标版 import truncate_pdf_multiple
 from flask_app.general.table_content_extraction import extract_tables_main
 from flask_app.old_version.文档理解大模型版知识库处理.知识库操作_old import addfileToKnowledge, deleteKnowledge
 from flask_app.工程标.提取json工程标版 import convert_clause_to_json
diff --git a/flask_app/old_version/解析old_old.py b/flask_app/old_version/解析old_old.py
index 368f12c..e34424a 100644
--- a/flask_app/old_version/解析old_old.py
+++ b/flask_app/old_version/解析old_old.py
@@ -3,7 +3,7 @@ import json
 import logging
 import time
 from concurrent.futures import ThreadPoolExecutor
-from flask_app.工程标.截取pdf import truncate_pdf_multiple
+from flask_app.工程标.截取pdf工程标版 import truncate_pdf_multiple
 from flask_app.general.table_content_extraction import extract_tables_main
 from flask_app.工程标.提取json工程标版 import convert_clause_to_json
 from flask_app.general.json_utils import transform_json_values
diff --git a/flask_app/routes/偏离表main.py b/flask_app/routes/偏离表main.py
index 5f2dc14..5491613 100644
--- a/flask_app/routes/偏离表main.py
+++ b/flask_app/routes/偏离表main.py
@@ -1,27 +1,16 @@
 import json
-import logging
-import re
-import string
 import time
 
 from flask_app.general.doubao import doubao_model
 from flask_app.general.format_change import pdf2docx, docx2pdf,doc2docx
-from flask_app.general.insert_del_pagemark import insert_mark
 from flask_app.general.json_utils import clean_json_string
-from flask_app.general.通义千问long import upload_file
-from flask_app.货物标.截取pdf货物标版 import truncate_pdf_specific_goods
+from flask_app.general.通用功能函数 import get_global_logger
+from flask_app.general.截取pdf_main import truncate_pdf_multiple
 from flask_app.货物标.提取采购需求main import fetch_procurement_reqs
 from flask_app.货物标.技术参数要求提取后处理函数 import extract_matching_keys
 from flask_app.货物标.资格审查main import combine_qualification_review
 import concurrent.futures
 
-def get_global_logger(unique_id):
-    if unique_id is None:
-        return logging.getLogger()  # 获取默认的日志器
-    logger = logging.getLogger(unique_id)
-    return logger
-
-
 logger = None
 def get_nested(dic, keys, default=None):
     for key in keys:
@@ -367,7 +356,7 @@ def get_tech_and_business_deviation(file_path,file_type,unique_id,output_folder)
         logger.error("Unsupported file type provided. Preprocessing halted.")
         return None
     selections=[1,3,5]
-    files=truncate_pdf_specific_goods(pdf_path,output_folder,selections,unique_id)
+    files=truncate_pdf_multiple(pdf_path,output_folder,logger,'goods',selections)
     notice_path=files[0]
     qualification_file=files[1]
     procurement_file=files[2]
diff --git a/flask_app/routes/小解析main.py b/flask_app/routes/小解析main.py
index 4106b0b..bd8dd1e 100644
--- a/flask_app/routes/小解析main.py
+++ b/flask_app/routes/小解析main.py
@@ -8,21 +8,14 @@ from flask_app.general.format_change import docx2pdf
 from flask_app.general.json_utils import clean_json_string
 from flask_app.general.多线程提问 import read_questions_from_file, multi_threading
 from flask_app.general.通义千问long import upload_file
+from flask_app.general.通用功能函数 import get_global_logger
 from flask_app.货物标.基础信息解析main import aggregate_basic_info_goods
-from flask_app.货物标.截取pdf货物标版 import truncate_pdf_specific_goods
-from flask_app.工程标.截取pdf import truncate_pdf_specific_engineering,truncate_pdf_main
+from flask_app.general.截取pdf_main import truncate_pdf_multiple
 from flask_app.general.post_processing import inner_post_processing
 from flask_app.工程标.基础信息整合工程标 import aggregate_basic_info_engineering
 
-
-def get_global_logger(unique_id):
-    if unique_id is None:
-        return logging.getLogger()  # 获取默认的日志器
-    logger = logging.getLogger(unique_id)
-    return logger
-
 #货物标
-def little_parse_goods(output_folder, pdf_path):
+def little_parse_goods(output_folder, pdf_path,logger):
     """
     解析货物相关的基础信息。
 
@@ -35,7 +28,7 @@ def little_parse_goods(output_folder, pdf_path):
     """
     # 截取特定的货物 PDF 文件
     selections = [1,4]  # 仅处理 selection 1和4  #公告+投标人须知
-    files = truncate_pdf_specific_goods(pdf_path, output_folder,selections)
+    files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods',selections)
     if not files:
         raise ValueError("未找到截取后的文件。")
     # 假设最后一个文件是需要处理的基础信息文件
@@ -58,7 +51,7 @@ def little_parse_goods(output_folder, pdf_path):
     return {"基础信息": aggregated_baseinfo}
 
 
-def little_parse_engineering(output_folder, pdf_path):
+def little_parse_engineering(output_folder, pdf_path,logger):
     """
     解析工程相关的基础信息。
 
@@ -71,13 +64,14 @@ def little_parse_engineering(output_folder, pdf_path):
     """
     # 截取特定的工程 PDF 文件
     selections = [ 1,4]     #公告+投标人须知前附表
-    files = truncate_pdf_specific_engineering(pdf_path, output_folder,selections)
+    files = truncate_pdf_multiple(pdf_path, output_folder,logger,'engineering',selections)
     if not files:
         raise ValueError("未找到截取后的文件。")
     # 假设最后一个文件是需要处理的基础信息文件
     baseinfo_file_path = files[-1]
     if not baseinfo_file_path:
-        baseinfo_file_path=truncate_pdf_main(pdf_path,output_folder,5)[0]  #invalid_path
+        selections=[5]  # fall back to selection 5 when no merged base-info file was produced
+        baseinfo_file_path=truncate_pdf_multiple(pdf_path,output_folder,logger,'engineering',selections)[0]  #invalid_path
     # 上传文件并获取文件 ID
     file_id = upload_file(baseinfo_file_path)
     # 注意：以下路径被硬编码，确保该路径存在并且正确
@@ -123,11 +117,11 @@ def little_parse_main(output_folder, file_path, file_type,zb_type,unique_id):
         return None
  # 根据招标类型调用相应的解析函数
     if zb_type == 2:  # 货物标
-        combined_data = little_parse_goods(output_folder, pdf_path)
+        combined_data = little_parse_goods(output_folder, pdf_path,logger)
         logger.info("Parsed goods data: %s", json.dumps(combined_data, ensure_ascii=False, indent=4))
         res = inner_post_processing(combined_data["基础信息"])
     elif zb_type==1:  #工程标
-        combined_data = little_parse_engineering(output_folder, pdf_path)
+        combined_data = little_parse_engineering(output_folder, pdf_path,logger)
         logger.info("Parsed engineering data: %s", json.dumps(combined_data, ensure_ascii=False, indent=4))
         res = inner_post_processing(combined_data["基础信息"])
         # 定义保存JSON文件的路径
diff --git a/flask_app/routes/工程标解析main.py b/flask_app/routes/工程标解析main.py
index 4cb1d3e..fd8f231 100644
--- a/flask_app/routes/工程标解析main.py
+++ b/flask_app/routes/工程标解析main.py
@@ -8,8 +8,9 @@ from concurrent.futures import ThreadPoolExecutor
 from docx import Document
 
 from flask_app.general.insert_del_pagemark import insert_mark,delete_mark
-from flask_app.工程标.截取pdf import truncate_pdf_multiple
+from flask_app.general.截取pdf_main import truncate_pdf_multiple
 from flask_app.general.merge_pdfs import merge_pdfs
+from flask_app.general.通用功能函数 import get_global_logger
 from flask_app.工程标.提取json工程标版 import convert_clause_to_json
 from flask_app.general.json_utils import transform_json_values
 from flask_app.general.无效标和废标公共代码 import combine_find_invalid
@@ -20,16 +21,9 @@ from flask_app.工程标.资格审查模块 import combine_review_standards
 from flask_app.general.商务技术评分提取 import combine_evaluation_standards
 from flask_app.general.format_change import pdf2docx, docx2pdf
 
-
-def get_global_logger(unique_id):
-    if unique_id is None:
-        return logging.getLogger()  # 获取默认的日志器
-    logger = logging.getLogger(unique_id)
-    return logger
-
 # 创建全局线程池
 executor = ThreadPoolExecutor()
-def preprocess_files(output_folder, file_path, file_type,unique_id,logger):
+def preprocess_files(output_folder, file_path, file_type,logger):
     logger.info("starting 文件预处理...")
     logger.info("output_folder..." + output_folder)
     start_time=time.time()
@@ -47,20 +41,20 @@ def preprocess_files(output_folder, file_path, file_type,unique_id,logger):
         logger.error("Unsupported file type provided. Preprocessing halted.")
         return None
     # 调用截取PDF多次
-    truncate_files = truncate_pdf_multiple(pdf_path, output_folder,unique_id)
+    truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'engineering')
     print("切割出的文件："+str(truncate_files))
 
     # 处理各个部分
-    tobidders_notice_table=truncate_files[0]     #投标人须知前附表
+    notice_path=truncate_files[0]     #招标公告
 
-    tobidders_notice = truncate_files[1]      #投标人须知正文
+    evaluation_method = truncate_files[1]      #评标方法
 
-    evaluation_method = truncate_files[2]           #评标方法
+    qualification = truncate_files[2]           #资格审查
 
-    qualification = truncate_files[3]       #资格审查
-    notice_path=truncate_files[4]   #公告
+    tobidders_notice_table = truncate_files[3]       #投标人须知前附表
+    tobidders_notice=truncate_files[4]   #投标人须知正文
 
-    invalid_path=truncate_files[5]
+    invalid_path=truncate_files[5]     #无效标
     # invalid_docpath = copy_docx(docx_path)  # docx截取无效标部分
     # invalid_docpath=pdf2docx(invalid_path)
     invalid_added_pdf = insert_mark(invalid_path)
@@ -215,7 +209,7 @@ def fetch_bid_opening(invalid_deleted_docx, merged_baseinfo_path_more, clause_pa
 def engineering_bid_main(output_folder, file_path, file_type, unique_id):
     logger = get_global_logger(unique_id)
     # 预处理文件，获取处理后的数据
-    processed_data = preprocess_files(output_folder, file_path, file_type,unique_id,logger)
+    processed_data = preprocess_files(output_folder, file_path, file_type,logger)
     if not processed_data:
         yield json.dumps({})  # 如果处理数据失败，返回空的 JSON
 
diff --git a/flask_app/routes/货物标解析main.py b/flask_app/routes/货物标解析main.py
index cb4b363..6798e85 100644
--- a/flask_app/routes/货物标解析main.py
+++ b/flask_app/routes/货物标解析main.py
@@ -7,23 +7,16 @@ from docx import Document
 from flask_app.general.format_change import docx2pdf, pdf2docx,doc2docx
 from flask_app.general.insert_del_pagemark import insert_mark, delete_mark
 from flask_app.general.json_utils import transform_json_values
+from flask_app.general.通用功能函数 import get_global_logger
 from flask_app.货物标.基础信息解析main import combine_basic_info
 from flask_app.货物标.投标人须知正文提取指定内容货物标版 import extract_from_notice
-from flask_app.货物标.截取pdf货物标版 import truncate_pdf_multiple
+from flask_app.general.截取pdf_main import truncate_pdf_multiple
 from concurrent.futures import ThreadPoolExecutor
 import concurrent.futures
 from flask_app.货物标.提取json货物标版 import convert_clause_to_json
 from flask_app.general.无效标和废标公共代码 import combine_find_invalid
 from flask_app.货物标.资格审查main import combine_qualification_review
 from flask_app.general.商务技术评分提取 import combine_evaluation_standards
-import logging
-
-
-def get_global_logger(unique_id):
-    if unique_id is None:
-        return logging.getLogger()  # 获取默认的日志器
-    logger = logging.getLogger(unique_id)
-    return logger
 
 # 创建全局线程池
 executor = ThreadPoolExecutor()
@@ -47,7 +40,7 @@ def preprocess_files(output_folder, file_path, file_type,logger):
         return None
 
     # 调用截取PDF多次
-    truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger)  # index: 0->商务技术服务要求 1->评标办法 2->资格审查 3->投标人须知前附表 4->投标人须知正文
+    truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods')  # index: 0->商务技术服务要求 1->评标办法 2->资格审查 3->投标人须知前附表 4->投标人须知正文
 
     # 处理各个部分
     invalid_path=pdf_path
diff --git a/flask_app/工程标/截取pdf.py b/flask_app/工程标/截取pdf工程标版.py
similarity index 69%
rename from flask_app/工程标/截取pdf.py
rename to flask_app/工程标/截取pdf工程标版.py
index f4ac13e..9af7820 100644
--- a/flask_app/工程标/截取pdf.py
+++ b/flask_app/工程标/截取pdf工程标版.py
@@ -1,21 +1,14 @@
+#flask_app/工程标/截取pdf工程标版.py
 import regex
 import os
 import time
 from PyPDF2 import PdfReader, PdfWriter
-from flask_app.general.clean_pdf import clean_page_content, extract_common_header
-from flask_app.general.merge_pdfs import merge_selected_pdfs_for_engineering
-import concurrent.futures
-import logging
+from flask_app.general.clean_pdf import clean_page_content
+
+from flask_app.general.截取pdf通用函数 import get_start_and_common_header
+from flask_app.general.通用功能函数 import get_global_logger
 
 
-def get_global_logger(unique_id):
-    if unique_id is None:
-        return logging.getLogger()  # 获取默认的日志器
-    logger = logging.getLogger(unique_id)
-    return logger
-
-
-logger = None
 def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page, common_header):
     try:
         # 获取文件基本名称
@@ -301,67 +294,39 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l
         return []
 
 
-def get_start_and_common_header(input_path):
-    common_header = extract_common_header(input_path)
-    last_begin_index = 0
-    begin_pattern = regex.compile(
-        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)）]?\s*$',
-        regex.MULTILINE
-    )
-    # 新增目录匹配模式
-    catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
-
-    pdf_document = PdfReader(input_path)
-    for i, page in enumerate(pdf_document.pages):
-        if i > 10:
-            return common_header, 0  # 如果页码大于10，直接返回last_begin_index=0
-        text = page.extract_text()
-        if text:
-            cleaned_text = clean_page_content(text, common_header)
-            # 检查是否存在"目录"
-            if catalog_pattern.search(cleaned_text):
-                continue  # 如果存在目录，跳过当前页面
-            if begin_pattern.search(cleaned_text):
-                last_begin_index = i  # 更新第一个匹配的索引，页码从0开始
-                return common_header, last_begin_index
-    return common_header, last_begin_index
-
-def truncate_pdf_main(input_path, output_folder, selection):
+def truncate_pdf_main_engineering(input_path, output_folder, selection,logger):
     if os.path.isdir(input_path):
         generated_files = []
         for file_name in os.listdir(input_path):
             if file_name.endswith('.pdf'):
                 file_path = os.path.join(input_path, file_name)
-                files = truncate_pdf_main(file_path, output_folder, selection)
+                files = truncate_pdf_main_engineering(file_path, output_folder, selection, logger)
                 if files:
                     generated_files.extend(files)
         return generated_files
     elif os.path.isfile(input_path) and input_path.endswith('.pdf'):
         # base_file_name = os.path.splitext(os.path.basename(input_path))[0]
-        common_header, last_begin_index = get_start_and_common_header(input_path)
+        common_header, last_begin_index = get_start_and_common_header(input_path,20)
         # print(last_begin_index)
         if selection == 1:
-            # Selection 1: 投标人须知前附表
+            # Selection 1: 招标公告
             pattern_pairs = [
                 (
                     regex.compile(
-                        r'(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知|(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表)\s*$',regex.MULTILINE),
-                    regex.compile(
-                        r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|^附录(?:一)?[：:]|^附件(?:一)?[：:]|^附表(?:一)?[：:]',
-                        regex.MULTILINE)
+                        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'),
+                    regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
                 ),
                 (
                     regex.compile(
-                        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|'
-                        r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表\s*$',
-                        regex.MULTILINE
-                    ),
+                        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)）]?\s*$',
+                        regex.MULTILINE),
                     regex.compile(
-                        r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|^附录(?:一)?[：:]|^附件(?:一)?[：:]|^附表(?:一)?[：:]',
-                        regex.MULTILINE)
+                        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知(?:前附表)?\s*$',
+                        regex.MULTILINE
+                    )
                 )
             ]
-            output_suffix = "tobidders_notice"
+            output_suffix = "notice"
         elif selection == 2:
             # Selection 2: 评标办法
             pattern_pairs = [
@@ -408,23 +373,28 @@ def truncate_pdf_main(input_path, output_folder, selection):
             ]
             output_suffix = "qualification"
         elif selection == 4:
-            # Selection 4: 招标公告
+            # Selection 4: 投标人须知
             pattern_pairs = [
-                (
-                    regex.compile(
-                        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'),
-                    regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE)
+            (
+                regex.compile(
+                    r'(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知|(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表)\s*$',
+                    regex.MULTILINE),
+                regex.compile(
+                    r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|^附录(?:一)?[：:]|^附件(?:一)?[：:]|^附表(?:一)?[：:]',
+                    regex.MULTILINE)
+            ),
+            (
+                regex.compile(
+                    r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|'
+                    r'(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表\s*$',
+                    regex.MULTILINE
                 ),
-                (
-                    regex.compile(r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)）]?\s*$', regex.MULTILINE),
-                    regex.compile(
-                        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知(?:前附表)?\s*$',
-                        regex.MULTILINE
-                    )
-                )
-            ]
-            output_suffix = "notice"
-
+                regex.compile(
+                    r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|^附录(?:一)?[：:]|^附件(?:一)?[：:]|^附表(?:一)?[：:]',
+                    regex.MULTILINE)
+            )
+        ]
+            output_suffix = "tobidders_notice"
         elif selection == 5:
             # Selection 5: 无效标
             pattern_pairs = [
@@ -447,13 +417,13 @@ def truncate_pdf_main(input_path, output_folder, selection):
         for idx, (begin_pattern, end_pattern) in enumerate(pattern_pairs, start=1):
             is_secondary_match = (idx == 2)
             begin_page = last_begin_index if last_begin_index != 0 else {
-                1: 3,  # 前附表
+                1: 0,  # 公告
                 2: 10,  # 评标
                 3: 5,  # 资格
-                4: 0,  # 公告
+                4: 3,  # 前附表
                 5: 0  # 无效标
             }.get(selection, 0)
-            if selection == 1:  # 投标人须知
+            if selection == 4:  # 投标人须知
                 output_paths = list(
                     extract_pages_tobidders_notice(input_path, output_folder, begin_pattern, begin_page, common_header,
                                                    is_secondary_match))
@@ -485,148 +455,29 @@ def truncate_pdf_main(input_path, output_folder, selection):
         if fallback_result and any(os.path.isfile(f) for f in fallback_result):
             return fallback_result
         else:
-            if selection == 1:
-                return ["", ""]
-            else:
-                return ['']
+            # 根据 selection 返回相应数量的空字符串
+            empty_returns = ["", ""] if selection == 4 else [""]
+            logger.error(f"Selection {selection}: 回退提取失败，返回空字符串。")
+            return empty_returns
     else:
         print("Invalid input path. It is neither a PDF file nor a directory containing PDF files.")
         return ['']
 
-
-def truncate_pdf_multiple(input_path, output_folder, unique_id="123"):
-    logger = get_global_logger(unique_id)
-    # 新增逻辑：检查 PDF 总页数
-    try:
-        reader = PdfReader(input_path)
-        num_pages = len(reader.pages)
-        logger.info(f"PDF '{input_path}' 的总页数为: {num_pages}")
-        if num_pages <= 50:
-            logger.info(f"PDF页数小于或等于50页，跳过切分逻辑。")
-            return ['', '', '', '', '', '', '']
-    except Exception as e:
-        logger.error(f"无法读取 PDF 页数: {e}")
-        return ['', '', '', '', '', '', '']
-    base_file_name = os.path.splitext(os.path.basename(input_path))[0]  # 纯文件名
-    truncate_files = []
-    selections = range(1, 6)  # 选择 1 到 5
-
-    # 使用 ThreadPoolExecutor 进行多线程处理
-    with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
-        # 提交所有任务并保持 selection 顺序
-        future_to_selection = {selection: executor.submit(truncate_pdf_main, input_path, output_folder, selection) for
-                               selection in selections}
-
-        # 按 selection 顺序收集结果
-        for selection in selections:
-            future = future_to_selection.get(selection)
-            try:
-                files = future.result()
-                if files and any(f for f in files):
-                    # 过滤空字符串
-                    valid_files = [f for f in files if f]
-                    truncate_files.extend(valid_files)
-                else:
-                    logger.error(f"Selection {selection}: 截取失败，已添加空字符串。")
-                    # Append two empty strings for selection=1; otherwise, one empty string
-                    truncate_files.extend(["", ""] if selection == 1 else [""])
-            except Exception as e:
-                logger.error(f"Selection {selection} 生成了一个异常: {e}")
-                # Append two empty strings for selection=1; otherwise, one empty string
-                truncate_files.extend(["", ""] if selection == 1 else [""])
-
-    # Proceed with merging logic as before...
-    if any(f for f in truncate_files if f):  # 检查是否有有效的文件路径
-        merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
-        merged_result = merge_selected_pdfs_for_engineering(output_folder, truncate_files, merged_output_path,
-                                                            base_file_name)
-        if merged_result:
-            truncate_files.append(merged_result)
-            # logger.info(f"merged_baseinfo: 已生成合并文件: {merged_output_path}")
-        else:
-            truncate_files.append("")  # 如果 merged_result 未生成，添加空字符串
-            logger.warning("merged_baseinfo: 未生成合并文件，因为没有找到需要合并的 PDF 文件。")
-    else:
-        truncate_files.append("")  # 如果没有文件需要合并，也添加空字符串
-        logger.warning(f"merged_baseinfo: 没有文件需要合并 for {input_path}")
-
-    return truncate_files
-
-
-def truncate_pdf_specific_engineering(pdf_path, output_folder, selections, unique_id="123"):
-    try:
-        logger = get_global_logger(unique_id)
-        try:
-            reader = PdfReader(input_path)
-            num_pages = len(reader.pages)
-            logger.info(f"PDF '{input_path}' 的总页数为: {num_pages}")
-            if num_pages <= 50:
-                logger.info(f"PDF页数小于或等于50页，跳过切分逻辑。")
-                return ['', '', '', '']
-        except Exception as e:
-            logger.error(f"无法读取 PDF 页数: {e}")
-            return ['', '', '', '']
-        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
-        truncate_files = []
-
-        # 使用 ThreadPoolExecutor 进行多线程处理
-        with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
-            # 提交所有任务并保持 selection 顺序
-            future_to_selection = {selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection) for
-                                   selection in selections}
-
-            # 按 selection 顺序收集结果
-            for selection in selections:
-                future = future_to_selection.get(selection)
-                try:
-                    files = future.result()
-                    if files and any(f for f in files):
-                        # 过滤空字符串
-                        valid_files = [f for f in files if f]
-                        truncate_files.extend(valid_files)
-                    else:
-                        logger.error(f"Selection {selection}: 截取失败，已添加空字符串。")
-                        # Append two empty strings for selection=1; otherwise, one empty string
-                        truncate_files.extend(["", ""] if selection == 1 else [""])
-                except Exception as e:
-                    logger.error(f"Selection {selection} 生成了一个异常: {e}")
-                    # Append two empty strings for selection=1; otherwise, one empty string
-                    truncate_files.extend(["", ""] if selection == 1 else [""])
-
-        # Proceed with merging logic as before...
-        if any(f for f in truncate_files if f):  # 检查是否有有效的文件路径
-            merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_specific.pdf")
-            merged_result = merge_selected_pdfs_for_engineering(output_folder, truncate_files, merged_output_path,
-                                                                base_file_name)
-            if merged_result:
-                truncate_files.append(merged_result)
-                logger.info(f"merged_specific: 已生成合并文件: {merged_output_path}")
-            else:
-                truncate_files.append("")  # 如果 merged_result 未生成，添加空字符串
-                logger.warning("merged_specific: 未生成合并文件，因为没有找到需要合并的 PDF 文件。")
-        else:
-            truncate_files.append("")  # 如果没有文件需要合并，也添加空字符串
-            logger.warning(f"merged_specific: 没有文件需要合并 for {pdf_path}")
-
-        return truncate_files
-
-    except Exception as e:
-        logger.error(f"Error in truncate_pdf_specific_engineering: {e}")
-        return [""] * len(selections)  # 返回与 selections 数量相同的空字符串列表
-
 if __name__ == "__main__":
+    logger=get_global_logger("123")
     start_time = time.time()
     # input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标"
-    input_path=r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\西藏监管局办.pdf"
-    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
+    pdf_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest4_evaluation_method.pdf"
+
+    # pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf"
     # input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
     output_folder = r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp"
-    files=truncate_pdf_multiple(input_path,output_folder)
-    # selections = [4, 1]  # 仅处理 selection 4、1
-    # files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
-    print(files)
-    # selection = 2  # 例如：1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标
-    # generated_files = truncate_pdf_main(input_path, output_folder, selection)
+    # selections = [1, 4]  # 仅处理 selection 4、1
+    selections=[5]
+    # files=truncate_pdf_multiple(pdf_path,output_folder,logger,'engineering',selections)
+    # print(files)
+    # selection = 1  # 例如：1 - 招标公告, 2 - 评标办法, 3 -资格审查条件 4-投标人须知前附表+正文 5-无效标
+    # generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection,logger)
     # print(generated_files)
     # print("生成的文件:", generated_files)
     end_time = time.time()
diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py
index d584149..b20d658 100644
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@@ -1,22 +1,14 @@
-import glob
 import logging
-
 from PyPDF2 import PdfReader, PdfWriter
 import regex  # 导入正则表达式库
 import os  # 用于文件和文件夹操作
-
 from flask_app.general.clean_pdf import clean_page_content, extract_common_header
 from flask_app.general.format_change import docx2pdf
-from flask_app.general.merge_pdfs import merge_and_cleanup, merge_selected_pdfs_for_goods
-import concurrent.futures
+from flask_app.general.merge_pdfs import merge_and_cleanup
+from flask_app.general.截取pdf通用函数 import get_start_and_common_header
+from flask_app.general.通用功能函数 import get_global_logger
 
 
-def get_global_logger(unique_id):
-    if unique_id is None:
-        return logging.getLogger()  # 获取默认的日志器
-    logger = logging.getLogger(unique_id)
-    return logger
-
 def is_pdf_or_doc(filename):
     # 判断文件是否为PDF或Word文档
     return filename.lower().endswith(('.pdf', '.doc', '.docx'))
@@ -28,21 +20,6 @@ def convert_to_pdf(file_path):
         return docx2pdf(file_path)
     return file_path
 
-
-def process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
-    pdf_path = convert_to_pdf(file_path)
-    result = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
-    if result:
-        if output_suffix == "tobidders_notice":
-            # 确保返回的是元组，并将其中的 None 转换为 ""
-            path1, path2 = result
-            return [path1 or "", path2 or ""]
-        elif output_suffix == "qualification1":
-            merge_and_cleanup(result, "qualification3")
-            return [result or ""]
-        return [result or ""]
-    return [""]  # 返回空字符串
-
 def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header, exclusion_pattern=None,
                           output_suffix="normal"):
     start_page = None
@@ -124,7 +101,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
                 print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中！尝试备用提取策略。")
                 return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page)
             elif output_suffix == "qualification1":
-                truncate_pdf_main(pdf_path, output_folder, 2, "qualification3")  # 合并'资格审查'章节和'评标办法'章节
+                truncate_pdf_main_goods(pdf_path, output_folder, 2, "qualification3")  # 合并'资格审查'章节和'评标办法'章节
             return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
     except Exception as e:
         print(f"Error processing {pdf_path}: {e}")
@@ -171,26 +148,6 @@ def get_patterns_for_notice():
     )
     return begin_pattern, end_pattern
 
-# def extract_pages_tobidders_notice(pdf_document, begin_pattern, end_pattern, begin_page, common_header,
-#                                    exclusion_pattern):
-#     start_page = None
-#     mid_page = None
-#     end_page = None
-#     for i, page in enumerate(pdf_document.pages):
-#         text = page.extract_text() or ""
-#         cleaned_text = clean_page_content(text, common_header)
-#         if exclusion_pattern and regex.search(exclusion_pattern, cleaned_text) and mid_page is not None:
-#             continue
-#         if start_page is None and regex.search(begin_pattern, cleaned_text) and i > begin_page:
-#             start_page = i
-#         if start_page is not None and mid_page is None and regex.search(
-#                 r'^\s*[（(]?\s*[一1]\s*[)）]?\s*[、．.]*\s*(说\s*明|总\s*则)', cleaned_text):
-#             mid_page = i
-#         if start_page is not None and mid_page is not None and regex.search(end_pattern, cleaned_text) and i > mid_page:
-#             end_page = i
-#             break
-#     return start_page, mid_page, end_page
-
 def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern):
     """
     从PDF文档中提取起始页、中间页和结束页。
@@ -510,7 +467,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, b
                             new_file.write(original_file.read())
                     return new_file_path
                 else:
-                    temp = truncate_pdf_main(pdf_path, output_folder, 2, "qualification2")
+                    temp = truncate_pdf_main_goods(pdf_path, output_folder, 2, "qualification2")
                     if len(temp) > 0:
                         return temp[0]
                     else:
@@ -558,33 +515,8 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo
         print(f"Error in save_extracted_pages: {e}")
         return ""  # 返回空字符串
 
-def get_start_and_common_header(input_path):
-    common_header = extract_common_header(input_path)
-    last_begin_index = 0
-    begin_pattern = regex.compile(
-        r'.*(?<!见\s*)(?<!与\s*)(?<!"\s*)(?<!“\s*)(?<!”\s*)(?:招标公告|磋商公告|谈判公告|邀请书|邀请函|投标邀请|磋商邀请|谈判邀请)[\)）]?\s*$',
-        regex.MULTILINE
-    )
-    # 新增目录匹配模式
-    catalog_pattern = regex.compile(r'\s*目\s*录\s*$', regex.MULTILINE)
 
-    pdf_document = PdfReader(input_path)
-    for i, page in enumerate(pdf_document.pages):
-        if i > 10:
-            return common_header, 0  # 如果页码大于10，直接返回last_begin_index=0
-        text = page.extract_text()
-        if text:
-            cleaned_text = clean_page_content(text, common_header)
-            # 检查是否存在"目录"
-            if catalog_pattern.search(cleaned_text):
-                continue  # 如果存在目录，跳过当前页面
-            if begin_pattern.search(cleaned_text):
-                last_begin_index = i  # 更新第一个匹配的索引，页码从0开始
-                return common_header, last_begin_index
-    return common_header, last_begin_index
-
-
-def truncate_pdf_main(input_path, output_folder, selection, output_suffix="default"):
+def truncate_pdf_main_goods(input_path, output_folder, selection, output_suffix="default"):
     try:
         # 检查是否为文件夹
         if os.path.isdir(input_path):
@@ -619,7 +551,7 @@ def process_input(input_path, output_folder, selection, output_suffix):
             os.makedirs(output_folder)
 
         # 获取起始和通用页眉
-        common_header, last_begin_index = get_start_and_common_header(input_path)
+        common_header, last_begin_index = get_start_and_common_header(input_path,10)
         begin_page = last_begin_index if last_begin_index != 0 else {
             4: 1,
             2: 5,
@@ -676,145 +608,43 @@ def process_input(input_path, output_folder, selection, output_suffix):
         if output_suffix == "default":
             output_suffix = local_output_suffix
 
-        # 调用实际处理文件内容的函数
-        return process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
+        # Former process_files logic, inlined; must run for every output_suffix, not only "default"
+        pdf_path = convert_to_pdf(input_path)
+        result = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
+
+        # Shape the return value according to output_suffix
+        if result:
+            if output_suffix == "tobidders_notice":
+                # result is expected to be a 2-tuple of paths here; map any None entry to ""
+                path1, path2 = result
+                return [path1 or "", path2 or ""]
+            elif output_suffix == "qualification1":
+                merge_and_cleanup(result, "qualification3")
+                return [result or ""]
+            return [result or ""]
+
+        return [""]
 
     except Exception as e:
         print(f"Error in process_input: {e}")
         return ['']
 
 
-def truncate_pdf_multiple(pdf_path, output_folder, logger):
-    # 新增逻辑：检查 PDF 总页数
-    try:
-        reader = PdfReader(pdf_path)
-        num_pages = len(reader.pages)
-        logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}")
-        if num_pages <= 50:
-            logger.info(f"PDF页数小于或等于50页，跳过切分逻辑。")
-            return ['', '', '', '', '', '', '']
-    except Exception as e:
-        logger.error(f"无法读取 PDF 页数: {e}")
-        return ['', '', '', '', '', '', '']
-
-    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
-    truncate_files = []
-
-    # 定义要处理的选择范围
-    selections = range(1, 6)
-
-    # 使用 ThreadPoolExecutor 进行多线程处理
-    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
-        # 提交所有任务并保持 selection 顺序
-        future_to_selection = {
-            selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection, output_suffix="default")
-            for selection in selections}
-
-        # 按 selection 顺序收集结果
-        for selection in selections:
-            future = future_to_selection.get(selection)
-            try:
-                files = future.result()
-                if files:
-                    truncate_files.extend(files)
-            except Exception as e:
-                logger.error(f"Selection {selection} 生成了一个异常: {e}")
-
-    # 定义合并后的输出路径
-    merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
-    # 调用 merge_selected_pdfs 并获取返回值
-    merged_path = merge_selected_pdfs_for_goods(output_folder, truncate_files, merged_output_path, base_file_name)
-    if merged_path:
-        # 合并成功，添加合并后的文件路径
-        truncate_files.append(merged_path)
-        # logger.info(f"已生成合并文件: {merged_output_path}")
-    else:
-        # 合并失败，添加空字符串
-        truncate_files.append("")
-        logger.warning(f"合并失败，没有生成合并文件 for {pdf_path}")
-    logger.info("已截取文件路径" + str(truncate_files))
-    return truncate_files
-
-
-# 小解析，只需要前三章内容
-def truncate_pdf_specific_goods(pdf_path, output_folder, selections, unique_id="123"):
-    """
-    处理 PDF 文件，选择指定的 selections，并合并结果。
-
-    Args:
-        pdf_path (str): 要处理的 PDF 文件路径。
-        output_folder (str): 截取后的文件保存文件夹路径。
-        selections (iterable): 要处理的 selection 列表。
-
-    Returns:
-        list: 截取的文件路径列表，包括合并后的文件路径（如果有）。
-    """
-    logger = get_global_logger(unique_id)
-    # 新增逻辑：检查 PDF 总页数
-    try:
-        reader = PdfReader(pdf_path)
-        num_pages = len(reader.pages)
-        logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}")
-        if num_pages <= 50:
-            logger.info(f"PDF页数小于或等于50页，跳过切分逻辑。")
-            return ['', '', '', '']
-    except Exception as e:
-        logger.error(f"无法读取 PDF 页数: {e}")
-        return ['', '', '', '']
-    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
-    truncate_files = []
-
-    # 使用 ThreadPoolExecutor 进行多线程处理
-    with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
-        # 提交所有任务并保持 selection 顺序
-        future_to_selection = {
-            selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection, output_suffix="default")
-            for selection in selections}
-
-        # 按 selection 顺序收集结果
-        for selection in selections:
-            future = future_to_selection.get(selection)
-            try:
-                files = future.result()
-                if files:
-                    if isinstance(files, list):
-                        truncate_files.extend(files)
-                    elif isinstance(files, str):
-                        truncate_files.append(files)
-            except Exception as e:
-                logger.error(f"Selection {selection} 生成了一个异常: {e}")
-
-    # 定义合并后的输出路径
-    merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
-    # 调用 merge_selected_pdfs 并获取返回值
-    merged_path = merge_selected_pdfs_for_goods(output_folder, truncate_files, merged_output_path, base_file_name)
-    if merged_path:
-        # 合并成功，添加合并后的文件路径
-        truncate_files.append(merged_path)
-        # logger.info(f"已生成合并文件: {merged_output_path}")
-    else:
-        # 合并失败，添加空字符串
-        truncate_files.append("")
-        logger.warning(f"合并失败，没有生成merged_baseinfo for {pdf_path}")
-    return truncate_files
-
-
 # TODO:交通智能系统和招标(1)(1)文件有问题  包头 绍兴     工程标中，判断是符合性审查之后，可以将它们设为同一章
 
 # ztbfile.pdf少资格评审  包头少符合性评审
 if __name__ == "__main__":
     logger = get_global_logger("123")
     # input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标"
-    # input_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
+    pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf"
     # input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf"
     # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件（广水市教育局封闭管理）.pdf"
-    input_path=r"C:\Users\Administrator\Desktop\fsdownload\ff2acdba-3a55-48a7-aa7a-61d9f89b909a\ztbfile.pdf"
+    # pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf"
     output_folder = r"C:\Users\Administrator\Desktop\fsdownload\ff2acdba-3a55-48a7-aa7a-61d9f89b909a\tmp"
     # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2"
-    # files = truncate_pdf_multiple(input_path, output_folder,logger)
-    # selections = [1,4]
-    # files=truncate_pdf_specific_goods(input_path,output_folder,selections)
+    # selections = [1, 4]
+    # files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'goods', selections)
     # print(files)
-    selection = 4  # 例如：1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求
-    generated_files = truncate_pdf_main(input_path, output_folder, selection)
-    print(generated_files)
+    # selection = 4  # 例如：1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-采购需求
+    # generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection)
+    # print(generated_files)