10.25切分改为多线程

2024-10-25 15:13:09 +08:00 · 2024-10-25 15:13:09 +08:00 · 6ecc370d27
commit 6ecc370d27
parent 7a03351f28
3 changed files with 748 additions and 427 deletions
--- a/flask_app/general/post_processing.py
+++ b/flask_app/general/post_processing.py
--- a/flask_app/main/截取pdf.py
+++ b/flask_app/main/截取pdf.py
@ -2,6 +2,7 @@ from PyPDF2 import PdfReader, PdfWriter
 import re  # 导入正则表达式库
 import os  # 用于文件和文件夹操作
 from flask_app.general.merge_pdfs import merge_pdfs
+import concurrent.futures
 def clean_page_content(text, common_header):
    # 首先删除抬头公共部分
    if common_header:  # 确保有公共抬头才进行替换
@ -390,15 +391,41 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na
        print(f"合并 PDF 文件时出错: {e}")
        return ""

+
 def truncate_pdf_multiple(input_path, output_folder):
+    """
+    处理 PDF 文件，选择 selection 1-6 的部分，并合并结果。
+
+    Args:
+        input_path (str): 要处理的 PDF 文件路径。
+        output_folder (str): 截取后的文件保存文件夹路径。
+
+    Returns:
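+        list: Paths of the extracted files, with an empty string "" as a placeholder for each selection that produced nothing or raised; includes the merged file path if there is one.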
+    """
    base_file_name = os.path.splitext(os.path.basename(input_path))[0]  # 纯文件名
    truncate_files = []
-    for selection in range(1, 7):
-        files = truncate_pdf_main(input_path, output_folder, selection)
-        if files:
-            truncate_files.extend(files)
-        else:
-            truncate_files.append("")  # 截取失败时添加空字符串
+    selections = range(1, 7)  # 选择 1 到 6
+
+    # 使用 ThreadPoolExecutor 进行多线程处理
+    with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
+        # 提交所有任务并保持 selection 顺序
+        future_to_selection = {selection: executor.submit(truncate_pdf_main, input_path, output_folder, selection) for selection in selections}
+
+        # 按 selection 顺序收集结果
+        for selection in selections:
+            future = future_to_selection.get(selection)
+            try:
+                files = future.result()
+                if files and any(f for f in files):
+                    # 过滤空字符串
+                    valid_files = [f for f in files if f]
+                    truncate_files.extend(valid_files)
+                else:
+                    truncate_files.append("")  # 截取失败时添加空字符串
+            except Exception as e:
+                print(f"Selection {selection} 生成了一个异常: {e}")
+                truncate_files.append("")  # 发生异常时添加空字符串

    if any(f for f in truncate_files if f):  # 检查是否有有效的文件路径
        merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
@ -414,14 +441,16 @@ def truncate_pdf_multiple(input_path, output_folder):
        print(f"没有文件需要合并 for {input_path}")

    return truncate_files
+
+
 def truncate_pdf_specific_engineering(pdf_path, output_folder, selections):
    """
-    处理 PDF 文件，选择 selection 为指定部分，并合并结果。
+    处理 PDF 文件，选择指定的 selections，并合并结果。

    Args:
        pdf_path (str): 要处理的 PDF 文件路径。
        output_folder (str): 截取后的文件保存文件夹路径。
-        selections (list): 需要截取的部分
+        selections (list): 需要截取的部分（例如 [4, 5]）。

    Returns:
        list: 截取的文件路径列表，包括合并后的文件路径（如果有）。
@ -430,19 +459,29 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections):
        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        truncate_files = []

-        for selection in selections:
-            files = truncate_pdf_main(pdf_path, output_folder, selection)
-            if files:
-                if isinstance(files, list):
-                    truncate_files.extend(files)
-                elif isinstance(files, str):
-                    truncate_files.append(files)
-            else:
-                truncate_files.append("")  # 截取失败时添加空字符串
-                print(f"截取 selection {selection} 失败，已添加空字符串。")
+        # 使用 ThreadPoolExecutor 进行多线程处理
+        with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
+            # 提交所有任务并保持 selection 顺序
+            future_to_selection = {selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection) for selection in selections}
+
+            # 按 selection 顺序收集结果
+            for selection in selections:
+                future = future_to_selection.get(selection)
+                try:
+                    files = future.result()
+                    if files and any(f for f in files):
+                        # 过滤空字符串
+                        valid_files = [f for f in files if f]
+                        truncate_files.extend(valid_files)
+                    else:
+                        truncate_files.append("")  # 截取失败时添加空字符串
+                        print(f"截取 selection {selection} 失败，已添加空字符串。")
+                except Exception as e:
+                    print(f"Selection {selection} 生成了一个异常: {e}")
+                    truncate_files.append("")  # 发生异常时添加空字符串

        if any(f for f in truncate_files if f):  # 检查是否有有效的文件路径
-            merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
+            merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_specific.pdf")
            merged_result = merge_selected_pdfs(output_folder, truncate_files, merged_output_path, base_file_name)
            if merged_result:
                truncate_files.append(merged_result)
@ -468,11 +507,11 @@ if __name__ == "__main__":
    # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\e378e002-45ff-440d-a65a-9974b5015472\\ztbfile.pdf"
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf"
    output_folder="C:\\Users\\Administrator\\Desktop\\招标文件\\output6"
-    # files=truncate_pdf_multiple(input_path,output_folder)
+    files=truncate_pdf_multiple(input_path,output_folder)
    # selections = [5, 1]  # 仅处理 selection 5、1 和 3
    # files=truncate_pdf_specific_engineering(input_path,output_folder,selections)
-    # print(files)
-    selection = 6 # 例如：1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标
-    generated_files = truncate_pdf_main(input_path, output_folder, selection)
-    print(generated_files)
-    # print("生成的文件:", generated_files)
+    print(files)
+    # selection = 6 # 例如：1 - 投标人须知前附表, 2 - 评标办法, 3 - 投标人须知正文 4-资格审查条件 5-招标公告 6-无效标
+    # generated_files = truncate_pdf_main(input_path, output_folder, selection)
+    # print(generated_files)
+    # print("生成的文件:", generated_files)
--- a/flask_app/货物标/截取pdf货物标版.py
+++ b/flask_app/货物标/截取pdf货物标版.py
@ -3,7 +3,7 @@ import re  # 导入正则表达式库
 import os  # 用于文件和文件夹操作
 from flask_app.general.format_change import docx2pdf
 from flask_app.general.merge_pdfs import merge_and_cleanup,merge_pdfs
-
+import concurrent.futures
 def clean_page_content(text, common_header):
    # 首先删除抬头公共部分
    if common_header:  # 确保有公共抬头才进行替换
@ -431,8 +431,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
            else:
                print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中！")
                return ""
-        return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix,
-                                    )
+        return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
    except Exception as e:
        print(f"Error in extract_pages_twice: {e}")
        return ""
@ -624,14 +623,30 @@ def truncate_pdf_main(input_path, output_folder, selection, output_suffix="defau
 def truncate_pdf_multiple(pdf_path, output_folder):
    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
    truncate_files = []
-    for selection in range(1, 6):
-        files = truncate_pdf_main(pdf_path, output_folder, selection)
-        if files:
-            truncate_files.extend(files)
+
+    # 定义要处理的选择范围
+    selections = range(1, 6)
+
+    # 使用 ThreadPoolExecutor 进行多线程处理
+    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
+        # 提交所有任务并保持 selection 顺序
+        future_to_selection = {
+            selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection, output_suffix="default")
+            for selection in selections}
+
+        # 按 selection 顺序收集结果
+        for selection in selections:
+            future = future_to_selection.get(selection)
+            try:
+                files = future.result()
+                if files:
+                    truncate_files.extend(files)
+            except Exception as e:
+                print(f"Selection {selection} 生成了一个异常: {e}")

    if truncate_files:
        merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf")
-        merge_selected_pdfs(output_folder, truncate_files, merged_output_path,base_file_name)
+        merge_selected_pdfs(output_folder, truncate_files, merged_output_path, base_file_name)
        truncate_files.append(merged_output_path)
        print(f"已生成合并文件: {merged_output_path}")
    else:
@ -640,13 +655,14 @@ def truncate_pdf_multiple(pdf_path, output_folder):
    return truncate_files

 #小解析，只需要前三章内容
-def truncate_pdf_specific_goods(pdf_path, output_folder,selections):
+def truncate_pdf_specific_goods(pdf_path, output_folder, selections):
    """
-    处理 PDF 文件，选择 selection 为 4 和 5 的部分，并合并结果。
+    处理 PDF 文件，选择指定的 selections，并合并结果。

    Args:
        pdf_path (str): 要处理的 PDF 文件路径。
        output_folder (str): 截取后的文件保存文件夹路径。
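+        selections (list): Selections to process; must be a non-empty sized collection (not a one-shot iterator), because len(selections) is passed as the thread pool's max_workers and the collection is iterated twice.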

    Returns:
        list: 截取的文件路径列表，包括合并后的文件路径（如果有）。
@ -654,13 +670,23 @@ def truncate_pdf_specific_goods(pdf_path, output_folder,selections):
    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
    truncate_files = []

-    for selection in selections:
-        files = truncate_pdf_main(pdf_path, output_folder, selection)
-        if files:
-            if isinstance(files, list):
-                truncate_files.extend(files)
-            elif isinstance(files, str):
-                truncate_files.append(files)
+    # 使用 ThreadPoolExecutor 进行多线程处理
+    with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor:
+        # 提交所有任务并保持 selection 顺序
+        future_to_selection = {selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection, output_suffix="default") for selection in selections}
+
+        # 按 selection 顺序收集结果
+        for selection in selections:
+            future = future_to_selection.get(selection)
+            try:
+                files = future.result()
+                if files:
+                    if isinstance(files, list):
+                        truncate_files.extend(files)
+                    elif isinstance(files, str):
+                        truncate_files.append(files)
+            except Exception as e:
+                print(f"Selection {selection} 生成了一个异常: {e}")

    if truncate_files:
        merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_specific.pdf")
@ -675,16 +701,17 @@ def truncate_pdf_specific_goods(pdf_path, output_folder,selections):

 # TODO:交通智能系统和招标(1)(1)文件有问题  包头 绍兴    资格审查文件可能不需要默认与"evaluation"同一章  无效投标可能也要考虑 “more”的情况，类似工程标
 if __name__ == "__main__":
-    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件（广水市教育局封闭管理项目二次）.pdf"
-    input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.pdf"
+    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件（广水市教育局封闭管理项目二次）.pdf"
+    # input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.pdf"
    # input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
    # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf"
    # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\ztbfile.pdf"
    # output_folder = "C:\\Users\\Administrator\\Desktop\\fsdownload\\a091d107-805d-4e28-b8b2-0c7327737238\\tmp"
    output_folder="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
    # files = truncate_pdf_multiple(input_path, output_folder)
-    # files=truncate_pdf_specific_goods(input_path,output_folder)
-    # print(files)
-    selection = 6# 例如：1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-公告
-    generated_files = truncate_pdf_main(input_path, output_folder, selection)
-    print(generated_files)
+    selections = [4, 5]
+    files=truncate_pdf_specific_goods(input_path,output_folder,selections)
+    print(files)
+    # selection = 6# 例如：1 - 商务技术服务要求, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2（与评标办法一致）  4.投标人须知前附表part1 投标人须知正文part2   5-公告
+    # generated_files = truncate_pdf_main(input_path, output_folder, selection)
+    # print(generated_files)