From cb153889a4363eb7cc7743969c235746a76efc24 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Tue, 17 Dec 2024 18:44:58 +0800 Subject: [PATCH] =?UTF-8?q?12.17=20=E6=97=A0=E6=95=88=E6=8A=95=E6=A0=87?= =?UTF-8?q?=E3=80=81=E5=BA=9F=E6=A0=87?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/general/merge_pdfs.py | 202 ++++---------- flask_app/general/截取pdf_main.py | 127 +++++++++ flask_app/general/截取pdf通用函数.py | 57 ++++ flask_app/general/通用功能函数.py | 11 +- flask_app/old_version/形式响应评审old.py | 2 +- flask_app/old_version/招标文件解析_old.py | 2 +- flask_app/old_version/解析old_old.py | 2 +- flask_app/routes/偏离表main.py | 17 +- flask_app/routes/小解析main.py | 26 +- flask_app/routes/工程标解析main.py | 28 +- flask_app/routes/货物标解析main.py | 13 +- .../工程标/{截取pdf.py => 截取pdf工程标版.py} | 257 ++++-------------- flask_app/货物标/截取pdf货物标版.py | 230 ++-------------- 13 files changed, 357 insertions(+), 617 deletions(-) create mode 100644 flask_app/general/截取pdf_main.py create mode 100644 flask_app/general/截取pdf通用函数.py rename flask_app/工程标/{截取pdf.py => 截取pdf工程标版.py} (69%) diff --git a/flask_app/general/merge_pdfs.py b/flask_app/general/merge_pdfs.py index 513b875..c6db9e9 100644 --- a/flask_app/general/merge_pdfs.py +++ b/flask_app/general/merge_pdfs.py @@ -91,21 +91,20 @@ def find_and_merge(target_path, output_suffix): paths=[target_path,full_path] merge_pdfs(paths,target_path) -def merge_selected_pdfs_for_engineering(output_folder, truncate_files, output_path, base_file_name): +def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_name, mode='engineering'): """ - 合并 output_folder 中以 {base_file_name}_before.pdf 结尾的 PDF 文件, - 以及 truncate_files 中以指定后缀结尾的文件,按照指定顺序合并。 + 通用的 PDF 合并函数,根据不同的模式合并指定的文件。 参数: - output_folder (str): 包含以 {base_file_name}_before.pdf 结尾的 PDF 文件的文件夹路径。 - truncate_files (list): 包含 PDF 文件路径的列表。 - output_path (str): 合并后的 PDF 文件保存路径。 - base_file_name (str): 用于匹配文件名的基础名称。 + - mode (str): 合并模式,支持 'engineering' 和 'goods'。 返回: - - str: 合并后的 PDF 文件路径,如果未找到所有需要合并的文件则返回 ""。 + - str: 如果合并成功,返回 output_path;否则,返回空字符串 ""。 """ - # 1. 获取 output_folder 中所有文件 try: all_output_files = os.listdir(output_folder) except FileNotFoundError: @@ -115,50 +114,82 @@ def merge_selected_pdfs_for_engineering(output_folder, truncate_files, output_pa print(f"没有权限访问输出文件夹 '{output_folder}'。") return "" - # 2. 定义要选择的文件后缀及合并顺序,包括 before 文件 - desired_suffixes = [ - f'{base_file_name}_before.pdf', - f'{base_file_name}_notice.pdf', - f'{base_file_name}_tobidders_notice_table.pdf' - ] + if mode == 'engineering': + required_suffixes = [ + f'{base_file_name}_before.pdf', + f'{base_file_name}_notice.pdf', + f'{base_file_name}_tobidders_notice_table.pdf' + ] + optional_suffixes = [] + elif mode == 'goods': + required_suffixes = [ + f'{base_file_name}_before.pdf', + f'{base_file_name}_notice.pdf', + f'{base_file_name}_tobidders_notice_part1.pdf' + ] + optional_suffixes = [ + f'{base_file_name}_tobidders_notice_part2.pdf' + ] + else: + print(f"未知的合并模式: {mode}") + return "" all_pdfs_to_merge = [] - missing_files = [] # 用于记录缺失的文件 + missing_files = [] - for suffix in desired_suffixes: + # 处理必需的文件 + for suffix in required_suffixes: if suffix == f'{base_file_name}_before.pdf': - # 从 output_folder 中选择以 {base_file_name}_before.pdf 结尾的文件 matching_files = [ os.path.join(output_folder, f) for f in all_output_files if f.endswith(suffix) ] else: - # 从 truncate_files 中选择以指定后缀结尾的文件 matching_files = [f for f in truncate_files if f.endswith(suffix)] if matching_files: - # 如果找到多个匹配的文件,按名称排序并添加 matching_files_sorted = sorted(matching_files) all_pdfs_to_merge.extend(matching_files_sorted) for f in matching_files_sorted: print(f"选中文件: {f}") else: print(f"没有找到以 '{suffix}' 结尾的文件。") - missing_files.append(suffix) # 记录缺失的文件 + missing_files.append(suffix) - # 检查是否所有需要的文件都已找到 + # 处理可选的文件 + for suffix in optional_suffixes: + if suffix == f'{base_file_name}_before.pdf': + matching_files = [ + os.path.join(output_folder, f) + for f in all_output_files + if f.endswith(suffix) + ] + else: + matching_files = [f for f in truncate_files if f.endswith(suffix)] + + if matching_files: + matching_files_sorted = sorted(matching_files) + all_pdfs_to_merge.extend(matching_files_sorted) + for f in matching_files_sorted: + print(f"选中文件: {f}") + else: + print(f"可选文件 '{suffix}' 未找到,继续合并。") + + # 检查是否所有必需的文件都已找到 if missing_files: print("缺少以下必要的 PDF 文件,无法进行合并:") for missing in missing_files: print(f" - {missing}") return "" + print(f"总共将要合并的 PDF 文件数量: {len(all_pdfs_to_merge)}") + # 过滤掉不存在或为空的文件路径 - all_pdfs_to_merge = [f for f in all_pdfs_to_merge if os.path.isfile(f)] + all_pdfs_to_merge = [f for f in all_pdfs_to_merge if os.path.isfile(f) and os.path.getsize(f) > 0] if not all_pdfs_to_merge: - print("没有找到要合并的 PDF 文件。") + print("没有找到要合并的有效 PDF 文件。") return "" # 调用 merge_pdfs 函数进行合并 @@ -171,134 +202,6 @@ def merge_selected_pdfs_for_engineering(output_folder, truncate_files, output_pa # 检查合并后的文件是否存在且不为空 if os.path.exists(output_path) and os.path.getsize(output_path) > 0: - # 合并成功,删除 {base_file_name}_before.pdf 文件 - before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf") - if os.path.exists(before_pdf_path): - try: - os.remove(before_pdf_path) - # print(f"已删除文件: {before_pdf_path}") - except Exception as e: - print(f"删除文件 {before_pdf_path} 时出错: {e}") - else: - print(f"未找到要删除的文件: {before_pdf_path}") - - return output_path - else: - print(f"合并失败,没有生成 '{output_path}'。") - return "" - - -#合并封面+招标公告+投标人须知前附表+须知正文 -def merge_selected_pdfs_for_goods(output_folder, truncate_files, output_path, base_file_name): - """ - 合并 output_folder 中以 {base_file_name}_before.pdf 结尾的 PDF 文件, - 以及 truncate_files 中以指定后缀结尾的文件,按照指定顺序合并。 - - 参数: - - output_folder (str): 包含以 {base_file_name}_before.pdf 结尾的 PDF 文件的文件夹路径。 - - truncate_files (list): 包含 PDF 文件路径的列表。 - - output_path (str): 合并后的 PDF 文件保存路径。 - - base_file_name (str): 用于匹配文件名的基础名称。 - - 返回: - - str: 如果合并成功,返回 output_path;否则,返回空字符串 ""。 - """ - # 1. 获取 output_folder 中所有文件 - try: - all_output_files = os.listdir(output_folder) - except FileNotFoundError: - print(f"输出文件夹 '{output_folder}' 未找到。") - return "" - except PermissionError: - print(f"没有权限访问输出文件夹 '{output_folder}'。") - return "" - - # 2. 定义要选择的文件后缀及合并顺序 - required_suffixes = [ - f'{base_file_name}_before.pdf', - f'{base_file_name}_notice.pdf', - f'{base_file_name}_tobidders_notice_part1.pdf' - ] - - optional_suffixes = [ - f'{base_file_name}_tobidders_notice_part2.pdf' - ] - - all_pdfs_to_merge = [] - missing_files = [] # 用于记录缺失的文件 - - # 3. 处理必需的文件 - for suffix in required_suffixes: - if suffix == f'{base_file_name}_before.pdf': - # 从 output_folder 中选择以 {base_file_name}_before.pdf 结尾的文件 - matching_files = [ - os.path.join(output_folder, f) - for f in all_output_files - if f.endswith(suffix) - ] - else: - # 从 truncate_files 中选择以指定后缀结尾的文件 - matching_files = [f for f in truncate_files if f.endswith(suffix)] - - if matching_files: - # 如果找到多个匹配的文件,按名称排序并添加 - matching_files_sorted = sorted(matching_files) - all_pdfs_to_merge.extend(matching_files_sorted) - # for f in matching_files_sorted: - # print(f"选中文件: {f}") - else: - print(f"没有找到以 '{suffix}' 结尾的文件。") - missing_files.append(suffix) # 记录缺失的文件 - - # 4. 处理可选的文件 - for suffix in optional_suffixes: - if suffix == f'{base_file_name}_before.pdf': - # 从 output_folder 中选择以 {base_file_name}_before.pdf 结尾的文件 - matching_files = [ - os.path.join(output_folder, f) - for f in all_output_files - if f.endswith(suffix) - ] - else: - # 从 truncate_files 中选择以指定后缀结尾的文件 - matching_files = [f for f in truncate_files if f.endswith(suffix)] - - if matching_files: - # 如果找到多个匹配的文件,按名称排序并添加 - matching_files_sorted = sorted(matching_files) - all_pdfs_to_merge.extend(matching_files_sorted) - for f in matching_files_sorted: - print(f"选中文件: {f}") - else: - print(f"可选文件 '{suffix}' 未找到,继续合并。") - - # 5. 检查是否所有必需的文件都已找到 - if missing_files: - print("缺少以下必要的 PDF 文件,无法进行合并:") - for missing in missing_files: - print(f" - {missing}") - return "" - - print(f"总共将要合并的 PDF 文件数量: {len(all_pdfs_to_merge)}") - - # 6. 过滤掉不存在或为空的文件路径 - all_pdfs_to_merge = [f for f in all_pdfs_to_merge if os.path.isfile(f) and os.path.getsize(f) > 0] - - if not all_pdfs_to_merge: - print("没有找到要合并的有效 PDF 文件。") - return "" - - # 7. 调用 merge_pdfs 函数进行合并 - try: - merge_pdfs(all_pdfs_to_merge, output_path) - print(f"已成功合并 PDF 文件到 '{output_path}'。") - except Exception as e: - print(f"合并 PDF 文件时出错: {e}") - return "" - - # 8. 检查合并后的文件是否存在且不为空 - if os.path.exists(output_path) and os.path.getsize(output_path) > 0: - # 合并成功,删除 {base_file_name}_before.pdf 文件 before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf") if os.path.exists(before_pdf_path): try: @@ -308,8 +211,7 @@ def merge_selected_pdfs_for_goods(output_folder, truncate_files, output_path, ba print(f"删除文件 {before_pdf_path} 时出错: {e}") else: print(f"未找到要删除的文件: {before_pdf_path}") - return output_path else: print(f"合并失败,没有生成 '{output_path}'。") - return "" \ No newline at end of file + return "" diff --git a/flask_app/general/截取pdf_main.py b/flask_app/general/截取pdf_main.py new file mode 100644 index 0000000..b435d72 --- /dev/null +++ b/flask_app/general/截取pdf_main.py @@ -0,0 +1,127 @@ +import concurrent.futures +import os +import time + +from flask_app.general.merge_pdfs import merge_selected_pdfs +from flask_app.general.截取pdf通用函数 import check_pdf_pages +from flask_app.general.通用功能函数 import get_global_logger +from flask_app.工程标.截取pdf工程标版 import truncate_pdf_main_engineering +from flask_app.货物标.截取pdf货物标版 import truncate_pdf_main_goods + + +def truncate_pdf_multiple(pdf_path, output_folder, logger,mode='goods',selections=None): + """ + 统一的 PDF 截取和合并函数,支持 'goods' 和 'engineering' 两种模式, + 以及默认的全选和指定选择。 + + 参数: + - pdf_path (str): 输入的 PDF 文件路径。 + - output_folder (str): 输出文件夹路径。 + - logger (Logger): 日志记录器。 + - mode (str, 可选): 模式类型,支持 'goods' 和 'engineering'。默认为 'engineering'。 + - selections (List[int], 可选): 选择的模式列表,默认为 [1, 2, 3, 4, 5]。 + + 返回: + - list: 截取和合并后的文件路径列表,如果截取或合并失败,则包含空字符串。 + """ + if selections is None: + selections = [1, 2, 3, 4, 5] + + # 确认模式有效 + if mode not in ['goods', 'engineering']: + raise ValueError("mode 参数必须是 'goods' 或 'engineering'") + def handle_exception(selection): + return ["", ""] if selection == 4 else [""] + # 设置模式相关的参数和函数 + if mode == 'goods': + truncate_function = truncate_pdf_main_goods + merge_mode = 'goods' + + # 根据 'goods' 模式定义异常处理的逻辑 + else: # mode == 'engineering' + truncate_function = truncate_pdf_main_engineering + merge_mode = 'engineering' + + mode_flag = 1 + num_selections = len(selections) + if num_selections < 5: + mode_flag = 2 + + res = check_pdf_pages(pdf_path, mode_flag, logger) + if res is not None: + return res # 返回包含空字符串的列表 + + base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] # 纯文件名 + truncate_files = [] + + # 使用 ThreadPoolExecutor 进行多线程处理 + with concurrent.futures.ThreadPoolExecutor(max_workers=num_selections) as executor: + # 提交所有任务并保持 selection 顺序 + future_to_selection = { + selection: executor.submit( + truncate_function, + pdf_path, + output_folder, + selection, + logger + # 如果 'goods' 模式需要额外参数,如 output_suffix,可以在这里处理 + ) + for selection in selections + } + + # 按 selection 顺序收集结果 + for selection in selections: + future = future_to_selection.get(selection) + try: + files = future.result() + if files: + truncate_files.extend(files) + # 无需额外处理,因为 `truncate_function` 已处理空字符串的添加和日志记录 + except Exception as e: + logger.error(f"Selection {selection} 生成了一个异常: {e}") + # 根据模式和 selection 添加相应数量的空字符串 + truncate_files.extend(handle_exception(selection)) + + # 定义合并后的输出路径 + merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf") + + # 调用 merge_selected_pdfs 并获取返回值 + merged_path = merge_selected_pdfs( + output_folder, + truncate_files, + merged_output_path, + base_file_name, + mode=merge_mode + ) + + if merged_path: + # 合并成功,添加合并后的文件路径 + truncate_files.append(merged_path) + logger.info(f"已成功合并 PDF 文件到 '{merged_output_path}'。") + else: + # 合并失败,添加空字符串 + truncate_files.append("") + logger.warning(f"合并失败,没有生成合并文件 for {pdf_path}") + + logger.info("已截取文件路径: " + str(truncate_files)) + return truncate_files + +if __name__ == "__main__": + logger=get_global_logger("123") + start_time = time.time() + # input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标" + pdf_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest5.pdf" + + # pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf" + # input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf" + output_folder = r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp" + # selections = [1, 4] # 仅处理 selection 4、1 + selections=[5] + files=truncate_pdf_multiple(pdf_path,output_folder,logger,'engineering',selections) + print(files) + # selection = 1 # 例如:1 - 招标公告, 2 - 评标办法, 3 -资格审查条件 4-投标人须知前附表+正文 5-无效标 + # generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection,logger) + # print(generated_files) + # print("生成的文件:", generated_files) + end_time = time.time() + print("耗时:" + str(end_time - start_time)) \ No newline at end of file diff --git a/flask_app/general/截取pdf通用函数.py b/flask_app/general/截取pdf通用函数.py new file mode 100644 index 0000000..b00f8f6 --- /dev/null +++ b/flask_app/general/截取pdf通用函数.py @@ -0,0 +1,57 @@ +#flask_app/general/截取pdf通用函数.py +import concurrent.futures +import os + +import regex +from PyPDF2 import PdfReader + +from flask_app.general.clean_pdf import extract_common_header, clean_page_content + +def get_start_and_common_header(input_path,end_page): + common_header = extract_common_header(input_path) + last_begin_index = 0 + begin_pattern = regex.compile( + r'.*(? end_page: + return common_header, 0 # 如果页码大于end_page,直接返回last_begin_index=0 + text = page.extract_text() + if text: + cleaned_text = clean_page_content(text, common_header) + # 检查是否存在"目录" + if catalog_pattern.search(cleaned_text): + continue # 如果存在目录,跳过当前页面 + if begin_pattern.search(cleaned_text): + last_begin_index = i # 更新第一个匹配的索引,页码从0开始 + return common_header, last_begin_index + return common_header, last_begin_index + +def check_pdf_pages(pdf_path, mode,logger): + try: + reader = PdfReader(pdf_path) + num_pages = len(reader.pages) + logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}") + if num_pages <= 50: + logger.info("PDF页数小于或等于50页,跳过切分逻辑。") + if mode==1: + return ['', '', '', '', '', '', ''] + else: + return ['','','',''] + # 若页数大于50页,返回None表示继续处理 + return None + except Exception as e: + logger.error(f"无法读取 PDF 页数: {e}") + # 返回空列表意味着无法执行后续处理逻辑 + if mode == 1: + return ['', '', '', '', '', '', ''] + else: + return ['', '', '', ''] + + + diff --git a/flask_app/general/通用功能函数.py b/flask_app/general/通用功能函数.py index 9e12881..3edc369 100644 --- a/flask_app/general/通用功能函数.py +++ b/flask_app/general/通用功能函数.py @@ -1,12 +1,9 @@ # -*- encoding:utf-8 -*- import ast -import json +import logging import re -from collections import OrderedDict - from flask_app.general.json_utils import clean_json_string from flask_app.general.多线程提问 import multi_threading -from flask_app.general.通义千问long import upload_file from flask_app.工程标.判断是否分包等 import read_questions_from_judge def process_judge_questions(judge_file_path, chosen_numbers, file_id, baseinfo_list1): @@ -72,3 +69,9 @@ def process_string_list(string_list): # 如果没有匹配到内容,返回空列表 return [] +def get_global_logger(unique_id): + if unique_id is None: + return logging.getLogger() # 获取默认的日志器 + logger = logging.getLogger(unique_id) + return logger + diff --git a/flask_app/old_version/形式响应评审old.py b/flask_app/old_version/形式响应评审old.py index 94ef52c..7101cc8 100644 --- a/flask_app/old_version/形式响应评审old.py +++ b/flask_app/old_version/形式响应评审old.py @@ -6,7 +6,7 @@ import time from flask_app.general.多线程提问 import multi_threading from flask_app.工程标.根据条款号整合json import process_and_merge_entries,process_and_merge2 from flask_app.general.json_utils import extract_content_from_json -from flask_app.工程标.截取pdf import truncate_pdf_main +from flask_app.工程标.截取pdf工程标版 import truncate_pdf_main from flask_app.工程标.提取json工程标版 import convert_clause_to_json prompt = """ # 角色 diff --git a/flask_app/old_version/招标文件解析_old.py b/flask_app/old_version/招标文件解析_old.py index 52ea4b2..2ef6fd8 100644 --- a/flask_app/old_version/招标文件解析_old.py +++ b/flask_app/old_version/招标文件解析_old.py @@ -3,7 +3,7 @@ import json import logging import time from concurrent.futures import ThreadPoolExecutor -from flask_app.工程标.截取pdf import truncate_pdf_multiple +from flask_app.工程标.截取pdf工程标版 import truncate_pdf_multiple from flask_app.general.table_content_extraction import extract_tables_main from flask_app.old_version.文档理解大模型版知识库处理.知识库操作_old import addfileToKnowledge, deleteKnowledge from flask_app.工程标.提取json工程标版 import convert_clause_to_json diff --git a/flask_app/old_version/解析old_old.py b/flask_app/old_version/解析old_old.py index 368f12c..e34424a 100644 --- a/flask_app/old_version/解析old_old.py +++ b/flask_app/old_version/解析old_old.py @@ -3,7 +3,7 @@ import json import logging import time from concurrent.futures import ThreadPoolExecutor -from flask_app.工程标.截取pdf import truncate_pdf_multiple +from flask_app.工程标.截取pdf工程标版 import truncate_pdf_multiple from flask_app.general.table_content_extraction import extract_tables_main from flask_app.工程标.提取json工程标版 import convert_clause_to_json from flask_app.general.json_utils import transform_json_values diff --git a/flask_app/routes/偏离表main.py b/flask_app/routes/偏离表main.py index 5f2dc14..5491613 100644 --- a/flask_app/routes/偏离表main.py +++ b/flask_app/routes/偏离表main.py @@ -1,27 +1,16 @@ import json -import logging -import re -import string import time from flask_app.general.doubao import doubao_model from flask_app.general.format_change import pdf2docx, docx2pdf,doc2docx -from flask_app.general.insert_del_pagemark import insert_mark from flask_app.general.json_utils import clean_json_string -from flask_app.general.通义千问long import upload_file -from flask_app.货物标.截取pdf货物标版 import truncate_pdf_specific_goods +from flask_app.general.通用功能函数 import get_global_logger +from flask_app.general.截取pdf_main import truncate_pdf_multiple from flask_app.货物标.提取采购需求main import fetch_procurement_reqs from flask_app.货物标.技术参数要求提取后处理函数 import extract_matching_keys from flask_app.货物标.资格审查main import combine_qualification_review import concurrent.futures -def get_global_logger(unique_id): - if unique_id is None: - return logging.getLogger() # 获取默认的日志器 - logger = logging.getLogger(unique_id) - return logger - - logger = None def get_nested(dic, keys, default=None): for key in keys: @@ -367,7 +356,7 @@ def get_tech_and_business_deviation(file_path,file_type,unique_id,output_folder) logger.error("Unsupported file type provided. Preprocessing halted.") return None selections=[1,3,5] - files=truncate_pdf_specific_goods(pdf_path,output_folder,selections,unique_id) + files=truncate_pdf_multiple(pdf_path,output_folder,logger,'goods',selections) notice_path=files[0] qualification_file=files[1] procurement_file=files[2] diff --git a/flask_app/routes/小解析main.py b/flask_app/routes/小解析main.py index 4106b0b..bd8dd1e 100644 --- a/flask_app/routes/小解析main.py +++ b/flask_app/routes/小解析main.py @@ -8,21 +8,14 @@ from flask_app.general.format_change import docx2pdf from flask_app.general.json_utils import clean_json_string from flask_app.general.多线程提问 import read_questions_from_file, multi_threading from flask_app.general.通义千问long import upload_file +from flask_app.general.通用功能函数 import get_global_logger from flask_app.货物标.基础信息解析main import aggregate_basic_info_goods -from flask_app.货物标.截取pdf货物标版 import truncate_pdf_specific_goods -from flask_app.工程标.截取pdf import truncate_pdf_specific_engineering,truncate_pdf_main +from flask_app.general.截取pdf_main import truncate_pdf_multiple from flask_app.general.post_processing import inner_post_processing from flask_app.工程标.基础信息整合工程标 import aggregate_basic_info_engineering - -def get_global_logger(unique_id): - if unique_id is None: - return logging.getLogger() # 获取默认的日志器 - logger = logging.getLogger(unique_id) - return logger - #货物标 -def little_parse_goods(output_folder, pdf_path): +def little_parse_goods(output_folder, pdf_path,logger): """ 解析货物相关的基础信息。 @@ -35,7 +28,7 @@ def little_parse_goods(output_folder, pdf_path): """ # 截取特定的货物 PDF 文件 selections = [1,4] # 仅处理 selection 1和4 #公告+投标人须知 - files = truncate_pdf_specific_goods(pdf_path, output_folder,selections) + files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods',selections) if not files: raise ValueError("未找到截取后的文件。") # 假设最后一个文件是需要处理的基础信息文件 @@ -58,7 +51,7 @@ def little_parse_goods(output_folder, pdf_path): return {"基础信息": aggregated_baseinfo} -def little_parse_engineering(output_folder, pdf_path): +def little_parse_engineering(output_folder, pdf_path,logger): """ 解析工程相关的基础信息。 @@ -71,13 +64,14 @@ def little_parse_engineering(output_folder, pdf_path): """ # 截取特定的工程 PDF 文件 selections = [ 1,4] #公告+投标人须知前附表 - files = truncate_pdf_specific_engineering(pdf_path, output_folder,selections) + files = truncate_pdf_multiple(pdf_path, output_folder,logger,'engineering',selections) if not files: raise ValueError("未找到截取后的文件。") # 假设最后一个文件是需要处理的基础信息文件 baseinfo_file_path = files[-1] if not baseinfo_file_path: - baseinfo_file_path=truncate_pdf_main(pdf_path,output_folder,5)[0] #invalid_path + selections=[5] + baseinfo_file_path=truncate_pdf_multiple(pdf_path,output_folder,logger,selections)[0] #invalid_path # 上传文件并获取文件 ID file_id = upload_file(baseinfo_file_path) # 注意:以下路径被硬编码,确保该路径存在并且正确 @@ -123,11 +117,11 @@ def little_parse_main(output_folder, file_path, file_type,zb_type,unique_id): return None # 根据招标类型调用相应的解析函数 if zb_type == 2: # 货物标 - combined_data = little_parse_goods(output_folder, pdf_path) + combined_data = little_parse_goods(output_folder, pdf_path,logger) logger.info("Parsed goods data: %s", json.dumps(combined_data, ensure_ascii=False, indent=4)) res = inner_post_processing(combined_data["基础信息"]) elif zb_type==1: #工程标 - combined_data = little_parse_engineering(output_folder, pdf_path) + combined_data = little_parse_engineering(output_folder, pdf_path,logger) logger.info("Parsed engineering data: %s", json.dumps(combined_data, ensure_ascii=False, indent=4)) res = inner_post_processing(combined_data["基础信息"]) # 定义保存JSON文件的路径 diff --git a/flask_app/routes/工程标解析main.py b/flask_app/routes/工程标解析main.py index 4cb1d3e..fd8f231 100644 --- a/flask_app/routes/工程标解析main.py +++ b/flask_app/routes/工程标解析main.py @@ -8,8 +8,9 @@ from concurrent.futures import ThreadPoolExecutor from docx import Document from flask_app.general.insert_del_pagemark import insert_mark,delete_mark -from flask_app.工程标.截取pdf import truncate_pdf_multiple +from flask_app.general.截取pdf_main import truncate_pdf_multiple from flask_app.general.merge_pdfs import merge_pdfs +from flask_app.general.通用功能函数 import get_global_logger from flask_app.工程标.提取json工程标版 import convert_clause_to_json from flask_app.general.json_utils import transform_json_values from flask_app.general.无效标和废标公共代码 import combine_find_invalid @@ -20,16 +21,9 @@ from flask_app.工程标.资格审查模块 import combine_review_standards from flask_app.general.商务技术评分提取 import combine_evaluation_standards from flask_app.general.format_change import pdf2docx, docx2pdf - -def get_global_logger(unique_id): - if unique_id is None: - return logging.getLogger() # 获取默认的日志器 - logger = logging.getLogger(unique_id) - return logger - # 创建全局线程池 executor = ThreadPoolExecutor() -def preprocess_files(output_folder, file_path, file_type,unique_id,logger): +def preprocess_files(output_folder, file_path, file_type,logger): logger.info("starting 文件预处理...") logger.info("output_folder..." + output_folder) start_time=time.time() @@ -47,20 +41,20 @@ def preprocess_files(output_folder, file_path, file_type,unique_id,logger): logger.error("Unsupported file type provided. Preprocessing halted.") return None # 调用截取PDF多次 - truncate_files = truncate_pdf_multiple(pdf_path, output_folder,unique_id) + truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'engineering') print("切割出的文件:"+str(truncate_files)) # 处理各个部分 - tobidders_notice_table=truncate_files[0] #投标人须知前附表 + notice_path=truncate_files[0] #招标公告 - tobidders_notice = truncate_files[1] #投标人须知正文 + evaluation_method = truncate_files[1] #评标方法 - evaluation_method = truncate_files[2] #评标方法 + qualification = truncate_files[2] #资格审查 - qualification = truncate_files[3] #资格审查 - notice_path=truncate_files[4] #公告 + tobidders_notice_table = truncate_files[3] #投标人须知前附表 + tobidders_notice=truncate_files[4] #投标人须知正文 - invalid_path=truncate_files[5] + invalid_path=truncate_files[5] #无效标 # invalid_docpath = copy_docx(docx_path) # docx截取无效标部分 # invalid_docpath=pdf2docx(invalid_path) invalid_added_pdf = insert_mark(invalid_path) @@ -215,7 +209,7 @@ def fetch_bid_opening(invalid_deleted_docx, merged_baseinfo_path_more, clause_pa def engineering_bid_main(output_folder, file_path, file_type, unique_id): logger = get_global_logger(unique_id) # 预处理文件,获取处理后的数据 - processed_data = preprocess_files(output_folder, file_path, file_type,unique_id,logger) + processed_data = preprocess_files(output_folder, file_path, file_type,logger) if not processed_data: yield json.dumps({}) # 如果处理数据失败,返回空的 JSON diff --git a/flask_app/routes/货物标解析main.py b/flask_app/routes/货物标解析main.py index cb4b363..6798e85 100644 --- a/flask_app/routes/货物标解析main.py +++ b/flask_app/routes/货物标解析main.py @@ -7,23 +7,16 @@ from docx import Document from flask_app.general.format_change import docx2pdf, pdf2docx,doc2docx from flask_app.general.insert_del_pagemark import insert_mark, delete_mark from flask_app.general.json_utils import transform_json_values +from flask_app.general.通用功能函数 import get_global_logger from flask_app.货物标.基础信息解析main import combine_basic_info from flask_app.货物标.投标人须知正文提取指定内容货物标版 import extract_from_notice -from flask_app.货物标.截取pdf货物标版 import truncate_pdf_multiple +from flask_app.general.截取pdf_main import truncate_pdf_multiple from concurrent.futures import ThreadPoolExecutor import concurrent.futures from flask_app.货物标.提取json货物标版 import convert_clause_to_json from flask_app.general.无效标和废标公共代码 import combine_find_invalid from flask_app.货物标.资格审查main import combine_qualification_review from flask_app.general.商务技术评分提取 import combine_evaluation_standards -import logging - - -def get_global_logger(unique_id): - if unique_id is None: - return logging.getLogger() # 获取默认的日志器 - logger = logging.getLogger(unique_id) - return logger # 创建全局线程池 executor = ThreadPoolExecutor() @@ -47,7 +40,7 @@ def preprocess_files(output_folder, file_path, file_type,logger): return None # 调用截取PDF多次 - truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger) # index: 0->商务技术服务要求 1->评标办法 2->资格审查 3->投标人须知前附表 4->投标人须知正文 + truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods') # index: 0->商务技术服务要求 1->评标办法 2->资格审查 3->投标人须知前附表 4->投标人须知正文 # 处理各个部分 invalid_path=pdf_path diff --git a/flask_app/工程标/截取pdf.py b/flask_app/工程标/截取pdf工程标版.py similarity index 69% rename from flask_app/工程标/截取pdf.py rename to flask_app/工程标/截取pdf工程标版.py index f4ac13e..9af7820 100644 --- a/flask_app/工程标/截取pdf.py +++ b/flask_app/工程标/截取pdf工程标版.py @@ -1,21 +1,14 @@ +#flask_app/工程标/截取pdf工程标版.py import regex import os import time from PyPDF2 import PdfReader, PdfWriter -from flask_app.general.clean_pdf import clean_page_content, extract_common_header -from flask_app.general.merge_pdfs import merge_selected_pdfs_for_engineering -import concurrent.futures -import logging +from flask_app.general.clean_pdf import clean_page_content + +from flask_app.general.截取pdf通用函数 import get_start_and_common_header +from flask_app.general.通用功能函数 import get_global_logger -def get_global_logger(unique_id): - if unique_id is None: - return logging.getLogger() # 获取默认的日志器 - logger = logging.getLogger(unique_id) - return logger - - -logger = None def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page, common_header): try: # 获取文件基本名称 @@ -301,67 +294,39 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l return [] -def get_start_and_common_header(input_path): - common_header = extract_common_header(input_path) - last_begin_index = 0 - begin_pattern = regex.compile( - r'.*(? 10: - return common_header, 0 # 如果页码大于10,直接返回last_begin_index=0 - text = page.extract_text() - if text: - cleaned_text = clean_page_content(text, common_header) - # 检查是否存在"目录" - if catalog_pattern.search(cleaned_text): - continue # 如果存在目录,跳过当前页面 - if begin_pattern.search(cleaned_text): - last_begin_index = i # 更新第一个匹配的索引,页码从0开始 - return common_header, last_begin_index - return common_header, last_begin_index - -def truncate_pdf_main(input_path, output_folder, selection): +def truncate_pdf_main_engineering(input_path, output_folder, selection,logger): if os.path.isdir(input_path): generated_files = [] for file_name in os.listdir(input_path): if file_name.endswith('.pdf'): file_path = os.path.join(input_path, file_name) - files = truncate_pdf_main(file_path, output_folder, selection) + files = truncate_pdf_main_engineering(file_path, output_folder, selection) if files: generated_files.extend(files) return generated_files elif os.path.isfile(input_path) and input_path.endswith('.pdf'): # base_file_name = os.path.splitext(os.path.basename(input_path))[0] - common_header, last_begin_index = get_start_and_common_header(input_path) + common_header, last_begin_index = get_start_and_common_header(input_path,20) # print(last_begin_index) if selection == 1: - # Selection 1: 投标人须知前附表 + # Selection 1: 招标公告 pattern_pairs = [ ( regex.compile( - r'(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知|(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知前附表)\s*$',regex.MULTILINE), - regex.compile( - r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标(方法|办法)前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]', - regex.MULTILINE) + r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'), + regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE) ), ( regex.compile( - r'.*(? begin_page: -# start_page = i -# if start_page is not None and mid_page is None and regex.search( -# r'^\s*[((]?\s*[一1]\s*[))]?\s*[、..]*\s*(说\s*明|总\s*则)', cleaned_text): -# mid_page = i -# if start_page is not None and mid_page is not None and regex.search(end_pattern, cleaned_text) and i > mid_page: -# end_page = i -# break -# return start_page, mid_page, end_page - def extract_pages_tobidders_notice(pdf_path, begin_pattern, begin_page, common_header, exclusion_pattern): """ 从PDF文档中提取起始页、中间页和结束页。 @@ -510,7 +467,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, b new_file.write(original_file.read()) return new_file_path else: - temp = truncate_pdf_main(pdf_path, output_folder, 2, "qualification2") + temp = truncate_pdf_main_goods(pdf_path, output_folder, 2, "qualification2") if len(temp) > 0: return temp[0] else: @@ -558,33 +515,8 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo print(f"Error in save_extracted_pages: {e}") return "" # 返回空字符串 -def get_start_and_common_header(input_path): - common_header = extract_common_header(input_path) - last_begin_index = 0 - begin_pattern = regex.compile( - r'.*(? 10: - return common_header, 0 # 如果页码大于10,直接返回last_begin_index=0 - text = page.extract_text() - if text: - cleaned_text = clean_page_content(text, common_header) - # 检查是否存在"目录" - if catalog_pattern.search(cleaned_text): - continue # 如果存在目录,跳过当前页面 - if begin_pattern.search(cleaned_text): - last_begin_index = i # 更新第一个匹配的索引,页码从0开始 - return common_header, last_begin_index - return common_header, last_begin_index - - -def truncate_pdf_main(input_path, output_folder, selection, output_suffix="default"): +def truncate_pdf_main_goods(input_path, output_folder, selection, output_suffix="default"): try: # 检查是否为文件夹 if os.path.isdir(input_path): @@ -619,7 +551,7 @@ def process_input(input_path, output_folder, selection, output_suffix): os.makedirs(output_folder) # 获取起始和通用页眉 - common_header, last_begin_index = get_start_and_common_header(input_path) + common_header, last_begin_index = get_start_and_common_header(input_path,10) begin_page = last_begin_index if last_begin_index != 0 else { 4: 1, 2: 5, @@ -676,145 +608,43 @@ def process_input(input_path, output_folder, selection, output_suffix): if output_suffix == "default": output_suffix = local_output_suffix - # 调用实际处理文件内容的函数 - return process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) + # 将原先的 process_files 逻辑合并到此处 + pdf_path = convert_to_pdf(input_path) + result = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) + + # 根据提取结果以及不同的 output_suffix 进行处理 + if result: + if output_suffix == "tobidders_notice": + # 确保返回的是元组,并将其中的 None 转换为 "" + path1, path2 = result + return [path1 or "", path2 or ""] + elif output_suffix == "qualification1": + merge_and_cleanup(result, "qualification3") + return [result or ""] + return [result or ""] + + return [""] except Exception as e: print(f"Error in process_input: {e}") return [''] -def truncate_pdf_multiple(pdf_path, output_folder, logger): - # 新增逻辑:检查 PDF 总页数 - try: - reader = PdfReader(pdf_path) - num_pages = len(reader.pages) - logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}") - if num_pages <= 50: - logger.info(f"PDF页数小于或等于50页,跳过切分逻辑。") - return ['', '', '', '', '', '', ''] - except Exception as e: - logger.error(f"无法读取 PDF 页数: {e}") - return ['', '', '', '', '', '', ''] - - base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] - truncate_files = [] - - # 定义要处理的选择范围 - selections = range(1, 6) - - # 使用 ThreadPoolExecutor 进行多线程处理 - with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: - # 提交所有任务并保持 selection 顺序 - future_to_selection = { - selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection, output_suffix="default") - for selection in selections} - - # 按 selection 顺序收集结果 - for selection in selections: - future = future_to_selection.get(selection) - try: - files = future.result() - if files: - truncate_files.extend(files) - except Exception as e: - logger.error(f"Selection {selection} 生成了一个异常: {e}") - - # 定义合并后的输出路径 - merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf") - # 调用 merge_selected_pdfs 并获取返回值 - merged_path = merge_selected_pdfs_for_goods(output_folder, truncate_files, merged_output_path, base_file_name) - if merged_path: - # 合并成功,添加合并后的文件路径 - truncate_files.append(merged_path) - # logger.info(f"已生成合并文件: {merged_output_path}") - else: - # 合并失败,添加空字符串 - truncate_files.append("") - logger.warning(f"合并失败,没有生成合并文件 for {pdf_path}") - logger.info("已截取文件路径" + str(truncate_files)) - return truncate_files - - -# 小解析,只需要前三章内容 -def truncate_pdf_specific_goods(pdf_path, output_folder, selections, unique_id="123"): - """ - 处理 PDF 文件,选择指定的 selections,并合并结果。 - - Args: - pdf_path (str): 要处理的 PDF 文件路径。 - output_folder (str): 截取后的文件保存文件夹路径。 - selections (iterable): 要处理的 selection 列表。 - - Returns: - list: 截取的文件路径列表,包括合并后的文件路径(如果有)。 - """ - logger = get_global_logger(unique_id) - # 新增逻辑:检查 PDF 总页数 - try: - reader = PdfReader(pdf_path) - num_pages = len(reader.pages) - logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}") - if num_pages <= 50: - logger.info(f"PDF页数小于或等于50页,跳过切分逻辑。") - return ['', '', '', ''] - except Exception as e: - logger.error(f"无法读取 PDF 页数: {e}") - return ['', '', '', ''] - base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] - truncate_files = [] - - # 使用 ThreadPoolExecutor 进行多线程处理 - with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor: - # 提交所有任务并保持 selection 顺序 - future_to_selection = { - selection: executor.submit(truncate_pdf_main, pdf_path, output_folder, selection, output_suffix="default") - for selection in selections} - - # 按 selection 顺序收集结果 - for selection in selections: - future = future_to_selection.get(selection) - try: - files = future.result() - if files: - if isinstance(files, list): - truncate_files.extend(files) - elif isinstance(files, str): - truncate_files.append(files) - except Exception as e: - logger.error(f"Selection {selection} 生成了一个异常: {e}") - - # 定义合并后的输出路径 - merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf") - # 调用 merge_selected_pdfs 并获取返回值 - merged_path = merge_selected_pdfs_for_goods(output_folder, truncate_files, merged_output_path, base_file_name) - if merged_path: - # 合并成功,添加合并后的文件路径 - truncate_files.append(merged_path) - # logger.info(f"已生成合并文件: {merged_output_path}") - else: - # 合并失败,添加空字符串 - truncate_files.append("") - logger.warning(f"合并失败,没有生成merged_baseinfo for {pdf_path}") - return truncate_files - - # TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 工程标中,判断是符合性审查之后,可以将它们设为同一章 # ztbfile.pdf少资格评审 包头少符合性评审 if __name__ == "__main__": logger = get_global_logger("123") # input_path = r"C:\Users\Administrator\Desktop\new招标文件\货物标" - # input_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf" + pdf_path = r"C:\Users\Administrator\Desktop\招标文件-采购类\2024-贵州-贵州医科大学附属医院导视系统零星制作安装项目.pdf" # input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf" # input_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\2-招标文件(广水市教育局封闭管理).pdf" - input_path=r"C:\Users\Administrator\Desktop\fsdownload\ff2acdba-3a55-48a7-aa7a-61d9f89b909a\ztbfile.pdf" + # pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf" output_folder = r"C:\Users\Administrator\Desktop\fsdownload\ff2acdba-3a55-48a7-aa7a-61d9f89b909a\tmp" # output_folder = r"C:\Users\Administrator\Desktop\new招标文件\output2" - # files = truncate_pdf_multiple(input_path, output_folder,logger) - # selections = [1,4] - # files=truncate_pdf_specific_goods(input_path,output_folder,selections) + # selections = [1, 4] + # files = truncate_pdf_multiple(pdf_path, output_folder,logger,selections) # print(files) - selection = 4 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 - generated_files = truncate_pdf_main(input_path, output_folder, selection) - print(generated_files) + # selection = 4 # 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 + # generated_files = truncate_pdf_main_goods(pdf_path, output_folder, selection) + # print(generated_files)