From 58328c2f22a537b9b71d18da96dc353d1c16322f Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Thu, 31 Oct 2024 20:12:08 +0800 Subject: [PATCH] 10.31 --- flask_app/general/merge_pdfs.py | 225 +++++++++++++++++++++- flask_app/general/读取文件/按页读取pdf.py | 3 +- flask_app/main/形式响应评审.py | 1 + flask_app/main/截取pdf.py | 192 ++++++------------ flask_app/货物标/截取pdf货物标版.py | 108 +---------- 5 files changed, 296 insertions(+), 233 deletions(-) diff --git a/flask_app/general/merge_pdfs.py b/flask_app/general/merge_pdfs.py index 1abe5d5..7e07c6a 100644 --- a/flask_app/general/merge_pdfs.py +++ b/flask_app/general/merge_pdfs.py @@ -69,4 +69,227 @@ def find_and_merge(target_path, output_suffix): paths=[target_path] else: paths=[target_path,full_path] - merge_pdfs(paths,target_path) \ No newline at end of file + merge_pdfs(paths,target_path) + +def merge_selected_pdfs_for_engineering(output_folder, truncate_files, output_path, base_file_name): + """ + 合并 output_folder 中以 {base_file_name}_before.pdf 结尾的 PDF 文件, + 以及 truncate_files 中以指定后缀结尾的文件,按照指定顺序合并。 + + 参数: + - output_folder (str): 包含以 {base_file_name}_before.pdf 结尾的 PDF 文件的文件夹路径。 + - truncate_files (list): 包含 PDF 文件路径的列表。 + - output_path (str): 合并后的 PDF 文件保存路径。 + - base_file_name (str): 用于匹配文件名的基础名称。 + + 返回: + - str: 合并后的 PDF 文件路径,如果未找到所有需要合并的文件则返回 ""。 + """ + # 1. 获取 output_folder 中所有文件 + try: + all_output_files = os.listdir(output_folder) + except FileNotFoundError: + print(f"输出文件夹 '{output_folder}' 未找到。") + return "" + except PermissionError: + print(f"没有权限访问输出文件夹 '{output_folder}'。") + return "" + + # 2. 定义要选择的文件后缀及合并顺序,包括 before 文件 + desired_suffixes = [ + f'{base_file_name}_before.pdf', + f'{base_file_name}_notice.pdf', + f'{base_file_name}_tobidders_notice_table.pdf' + ] + + all_pdfs_to_merge = [] + missing_files = [] # 用于记录缺失的文件 + + for suffix in desired_suffixes: + if suffix == f'{base_file_name}_before.pdf': + # 从 output_folder 中选择以 {base_file_name}_before.pdf 结尾的文件 + matching_files = [ + os.path.join(output_folder, f) + for f in all_output_files + if f.endswith(suffix) + ] + else: + # 从 truncate_files 中选择以指定后缀结尾的文件 + matching_files = [f for f in truncate_files if f.endswith(suffix)] + + if matching_files: + # 如果找到多个匹配的文件,按名称排序并添加 + matching_files_sorted = sorted(matching_files) + all_pdfs_to_merge.extend(matching_files_sorted) + for f in matching_files_sorted: + print(f"选中文件: {f}") + else: + print(f"没有找到以 '{suffix}' 结尾的文件。") + missing_files.append(suffix) # 记录缺失的文件 + + # 检查是否所有需要的文件都已找到 + if missing_files: + print("缺少以下必要的 PDF 文件,无法进行合并:") + for missing in missing_files: + print(f" - {missing}") + return "" + + # 过滤掉不存在或为空的文件路径 + all_pdfs_to_merge = [f for f in all_pdfs_to_merge if os.path.isfile(f)] + + if not all_pdfs_to_merge: + print("没有找到要合并的 PDF 文件。") + return "" + + # 调用 merge_pdfs 函数进行合并 + try: + merge_pdfs(all_pdfs_to_merge, output_path) + print(f"已成功合并 PDF 文件到 '{output_path}'。") + except Exception as e: + print(f"合并 PDF 文件时出错: {e}") + return "" + + # 检查合并后的文件是否存在且不为空 + if os.path.exists(output_path) and os.path.getsize(output_path) > 0: + # 合并成功,删除 {base_file_name}_before.pdf 文件 + before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf") + if os.path.exists(before_pdf_path): + try: + os.remove(before_pdf_path) + print(f"已删除文件: {before_pdf_path}") + except Exception as e: + print(f"删除文件 {before_pdf_path} 时出错: {e}") + else: + print(f"未找到要删除的文件: {before_pdf_path}") + + return output_path + else: + print(f"合并失败,没有生成 '{output_path}'。") + return "" + + +#合并封面+招标公告+投标人须知前附表+须知正文 +def merge_selected_pdfs_for_goods(output_folder, truncate_files, output_path, base_file_name): + """ + 合并 output_folder 中以 {base_file_name}_before.pdf 结尾的 PDF 文件, + 以及 truncate_files 中以指定后缀结尾的文件,按照指定顺序合并。 + + 参数: + - output_folder (str): 包含以 {base_file_name}_before.pdf 结尾的 PDF 文件的文件夹路径。 + - truncate_files (list): 包含 PDF 文件路径的列表。 + - output_path (str): 合并后的 PDF 文件保存路径。 + - base_file_name (str): 用于匹配文件名的基础名称。 + + 返回: + - str: 如果合并成功,返回 output_path;否则,返回空字符串 ""。 + """ + # 1. 获取 output_folder 中所有文件 + try: + all_output_files = os.listdir(output_folder) + except FileNotFoundError: + print(f"输出文件夹 '{output_folder}' 未找到。") + return "" + except PermissionError: + print(f"没有权限访问输出文件夹 '{output_folder}'。") + return "" + + # 2. 定义要选择的文件后缀及合并顺序 + required_suffixes = [ + f'{base_file_name}_before.pdf', + f'{base_file_name}_notice.pdf', + f'{base_file_name}_tobidders_notice_part1.pdf' + ] + + optional_suffixes = [ + f'{base_file_name}_tobidders_notice_part2.pdf' + ] + + all_pdfs_to_merge = [] + missing_files = [] # 用于记录缺失的文件 + + # 3. 处理必需的文件 + for suffix in required_suffixes: + if suffix == f'{base_file_name}_before.pdf': + # 从 output_folder 中选择以 {base_file_name}_before.pdf 结尾的文件 + matching_files = [ + os.path.join(output_folder, f) + for f in all_output_files + if f.endswith(suffix) + ] + else: + # 从 truncate_files 中选择以指定后缀结尾的文件 + matching_files = [f for f in truncate_files if f.endswith(suffix)] + + if matching_files: + # 如果找到多个匹配的文件,按名称排序并添加 + matching_files_sorted = sorted(matching_files) + all_pdfs_to_merge.extend(matching_files_sorted) + for f in matching_files_sorted: + print(f"选中文件: {f}") + else: + print(f"没有找到以 '{suffix}' 结尾的文件。") + missing_files.append(suffix) # 记录缺失的文件 + + # 4. 处理可选的文件 + for suffix in optional_suffixes: + if suffix == f'{base_file_name}_before.pdf': + # 从 output_folder 中选择以 {base_file_name}_before.pdf 结尾的文件 + matching_files = [ + os.path.join(output_folder, f) + for f in all_output_files + if f.endswith(suffix) + ] + else: + # 从 truncate_files 中选择以指定后缀结尾的文件 + matching_files = [f for f in truncate_files if f.endswith(suffix)] + + if matching_files: + # 如果找到多个匹配的文件,按名称排序并添加 + matching_files_sorted = sorted(matching_files) + all_pdfs_to_merge.extend(matching_files_sorted) + for f in matching_files_sorted: + print(f"选中文件: {f}") + else: + print(f"可选文件 '{suffix}' 未找到,继续合并。") + + # 5. 检查是否所有必需的文件都已找到 + if missing_files: + print("缺少以下必要的 PDF 文件,无法进行合并:") + for missing in missing_files: + print(f" - {missing}") + return "" + + print(f"总共将要合并的 PDF 文件数量: {len(all_pdfs_to_merge)}") + + # 6. 过滤掉不存在或为空的文件路径 + all_pdfs_to_merge = [f for f in all_pdfs_to_merge if os.path.isfile(f) and os.path.getsize(f) > 0] + + if not all_pdfs_to_merge: + print("没有找到要合并的有效 PDF 文件。") + return "" + + # 7. 调用 merge_pdfs 函数进行合并 + try: + merge_pdfs(all_pdfs_to_merge, output_path) + print(f"已成功合并 PDF 文件到 '{output_path}'。") + except Exception as e: + print(f"合并 PDF 文件时出错: {e}") + return "" + + # 8. 检查合并后的文件是否存在且不为空 + if os.path.exists(output_path) and os.path.getsize(output_path) > 0: + # 合并成功,删除 {base_file_name}_before.pdf 文件 + before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf") + if os.path.exists(before_pdf_path): + try: + os.remove(before_pdf_path) + print(f"已删除文件: {before_pdf_path}") + except Exception as e: + print(f"删除文件 {before_pdf_path} 时出错: {e}") + else: + print(f"未找到要删除的文件: {before_pdf_path}") + + return output_path + else: + print(f"合并失败,没有生成 '{output_path}'。") + return "" \ No newline at end of file diff --git a/flask_app/general/读取文件/按页读取pdf.py b/flask_app/general/读取文件/按页读取pdf.py index 6559287..1021b40 100644 --- a/flask_app/general/读取文件/按页读取pdf.py +++ b/flask_app/general/读取文件/按页读取pdf.py @@ -69,6 +69,7 @@ def extract_text_by_page(file_path): page = reader.pages[page_num] text = page.extract_text() if text: + print("-------------------") cleaned_text = clean_page_content(text,common_header) print(cleaned_text) result += cleaned_text @@ -149,7 +150,7 @@ def extract_text_by_page(file_path): if __name__ == '__main__': - file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest18.pdf" + file_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\招标文件\\HBDL-2024-0181-001-招标文件.pdf" # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf' # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf' diff --git a/flask_app/main/形式响应评审.py b/flask_app/main/形式响应评审.py index b7d3f14..b57ef17 100644 --- a/flask_app/main/形式响应评审.py +++ b/flask_app/main/形式响应评审.py @@ -321,6 +321,7 @@ def process_reviews(original_dict_data, output_folder, truncate0_jsonpath, claus updated_json = update_json_data(original_dict_data, combined_results1, combined_results2, first_response_list) return updated_json +#TODO:现在是clause truncate_jsonpath非空就尝试从中提取,但是有可能从里面提取失败,这个时候可能需要重新问大模型。 if __name__ == "__main__": start_time=time.time() # knowledge_name="zbfile" diff --git a/flask_app/main/截取pdf.py b/flask_app/main/截取pdf.py index adb41e1..0c8517f 100644 --- a/flask_app/main/截取pdf.py +++ b/flask_app/main/截取pdf.py @@ -3,7 +3,7 @@ import os import time from PyPDF2 import PdfReader, PdfWriter -from flask_app.general.merge_pdfs import merge_pdfs +from flask_app.general.merge_pdfs import merge_pdfs, merge_selected_pdfs_for_engineering import concurrent.futures import logging @@ -122,7 +122,7 @@ def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, en print(f"Error in save_pages_to_new_pdf: {e}") return "" # 返回空字符串 -def extract_pages_tobidders_notice(pdf_path, output_folder,begin_pattern, begin_page,common_header): +def extract_pages_tobidders_notice(pdf_path, output_folder,begin_pattern, begin_page,common_header,is_secondary_match): pdf_document = PdfReader(pdf_path) exclusion_pattern = re.compile(r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据') def run_extraction(): @@ -152,16 +152,19 @@ def extract_pages_tobidders_notice(pdf_path, output_folder,begin_pattern, begin_ if chapter_type: # 根据 chapter_type 动态生成 end_pattern - end_pattern = re.compile( - rf'^第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|' # Match chapter pattern - r'^评标办法前附表|' # Match "评标办法前附表" at the beginning of a line - r'^附录(?:一)?[::]|' # Match "附录一" with optional "一" and colon - r'^附件(?:一)?[::]|' # Match "附件一" with optional "一" and colon - r'^附表(?:一)?[::]', # Match "附表一" with optional "一" and colon - re.MULTILINE - ) - # print(f"动态生成的 end_pattern: {end_pattern.pattern}") # 打印生成的 end_pattern - + if not is_secondary_match: + end_pattern = re.compile( + rf'^第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+|' + r'^评标办法前附表|' + r'^附录(?:一)?[::]|' + r'^附件(?:一)?[::]|' + r'^附表(?:一)?[::]', + re.MULTILINE + ) + else: + end_pattern = re.compile( + rf'第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00 -\u9fff]+\s*$' + ) # 根据 chapter_type 动态生成 additional_mid_pattern if chapter_type == '章': additional_mid_pattern = r'^第[一二三四五六七八九十百千]+?(?:部分)' @@ -189,16 +192,20 @@ def extract_pages_tobidders_notice(pdf_path, output_folder,begin_pattern, begin_ # print(f"生成的 combined_mid_pattern: {combined_mid_pattern.pattern}") # 打印 combined_mid_pattern else: # 如果未匹配到“章”或“部分”,使用默认的 end_pattern 和 mid_pattern - end_pattern = re.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|' - r'^评标办法前附表|' # Match "评标办法前附表" at the beginning of a line - r'^附录(?:一)?[::]|' # Match "附录一" with optional "一" and colon - r'^附件(?:一)?[::]|' # Match "附件一" with optional "一" and colon - r'^附表(?:一)?[::]', # Match "附表一" with optional "一" and colon - re.MULTILINE - ) + if not is_secondary_match: + end_pattern = re.compile( + r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+|' + r'^评标办法前附表|' # Match "评标办法前附表" at the beginning of a line + r'^附录(?:一)?[::]|' # Match "附录一" with optional "一" and colon + r'^附件(?:一)?[::]|' # Match "附件一" with optional "一" and colon + r'^附表(?:一)?[::]', # Match "附表一" with optional "一" and colon + re.MULTILINE + ) + else: + end_pattern = re.compile( + rf'第[一二三四五六七八九十百千]+?(?:{chapter_type})\s*[\u4e00-\u9fff]+\s*$' + ) # print(f"使用默认的 end_pattern: {end_pattern.pattern}") # 打印默认的 end_pattern - # 定义基础的 mid_pattern base_mid_pattern = r'^\s*(?:[((]\s*[一1]?\s*[))]\s*[、..]*|[一1][、..]+|[、..]+)\s*(说\s*明|总\s*则)' combined_mid_pattern = re.compile( @@ -212,14 +219,9 @@ def extract_pages_tobidders_notice(pdf_path, output_folder,begin_pattern, begin_ mid_page = i if start_page is not None and mid_page is not None and chapter_type: if re.search(end_pattern, cleaned_text): - if mid_page is None: - if i > start_page: - end_page = i - break - else: - if i > mid_page: - end_page = i - break + if i > mid_page: + end_page = i + break return start_page, mid_page, end_page @@ -248,11 +250,10 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter if text: cleaned_text = clean_page_content(text, common_header) if re.search(begin_pattern, cleaned_text) and i >= begin_page: - if not start_page: - if (output_suffix == "notice" or output_suffix == "invalid") and is_secondary_match: - pass # 针对 '同招标公告' 的情况 - elif output_suffix not in ["notice", "invalid"] or not is_secondary_match: - start_page = i + if start_page and (output_suffix == "notice" or output_suffix == "invalid"): + pass + else: + start_page = i if start_page is not None and re.search(end_pattern, cleaned_text): condition = i > start_page if condition: @@ -332,7 +333,14 @@ def truncate_pdf_main(input_path, output_folder, selection): re.compile( r'(?:第[一二三四五六七八九十百千]+(?:章|部分)\s*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知+|(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表)'), re.compile( - r'第[一二三四五六七八九十]+章\s*评标办法|^评标办法前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]', + r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标办法前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]', + re.MULTILINE) + ), + ( + re.compile( + r'.*(?:投标人?|磋商|供应商|谈判供应商|磋商供应商)须知\s*$|(?:一\s*、\s*)?(?:投标人?|磋商|供应商)须知前附表', re.MULTILINE), + re.compile( + r'第[一二三四五六七八九十]+(?:章|部分)\s*[\u4e00-\u9fff]+|^评标办法前附表|^附录(?:一)?[::]|^附件(?:一)?[::]|^附表(?:一)?[::]', re.MULTILINE) ) ] @@ -341,21 +349,28 @@ def truncate_pdf_main(input_path, output_folder, selection): # Selection 2: 评标办法 pattern_pairs = [ ( - re.compile(r'第[一二三四五六七八九十]+章\s*评标办法'), # Alternative begin pattern + re.compile(r'第[一二三四五六七八九十]+(?:章|部分)\s*评标办法'), # Alternative begin pattern re.compile(r'评标办法正文|评标办法') # Alternative end pattern ), ] output_suffix = "evaluation_method" - +#TODO:exclusion elif selection == 3: # Selection 3: 资格审查条件 pattern_pairs = [ ( - re.compile(r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::]).*(?:资质|能力|信誉).*$', + re.compile(r'^(?:附录(?:[一1])?[::]|附件(?:[一1])?[::]|附表(?:[一1])?[::]).*(?:资质|能力|信誉).*$|^第[一二三四五六七八九十百千]+(?:章|部分).*?(资格审查).*', re.MULTILINE), re.compile( r'^(?:附录[一二三四五六七八九1-9]*[::]|附件[一二三四五六七八九1-9]*[::]|附表[一二三四五六七八九1-9]*[::])(?!.*(?:资质|能力|信誉)).*$' r'^第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE) + ), + ( + re.compile( + r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]*资格[\u4e00-\u9fff]*\s*$',re.MULTILINE), + re.compile( + r'^(?:附录[一二三四五六七八九1-9]*[::]|附件[一二三四五六七八九1-9]*[::]|附表[一二三四五六七八九1-9]*[::])(?!.*(?:资质|能力|信誉)).*|' + r'第[一二三四五六七八九1-9]+(?:章|部分)\s*[\u4e00-\u9fff]+\s*$',re.MULTILINE) ) ] output_suffix = "qualification" @@ -383,7 +398,7 @@ def truncate_pdf_main(input_path, output_folder, selection): re.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|^第二卷', re.MULTILINE) ), ( - re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|第一卷|投标邀请)\s*$', re.MULTILINE), + re.compile(r'.*(?:招标公告|投标邀请书|投标邀请函|投标邀请)[\))]?\s*$', re.MULTILINE), re.compile(r'第[一二三四五六七八九十]+章\s*合同|[::]清标报告|^第二卷', re.MULTILINE) ) ] @@ -404,7 +419,7 @@ def truncate_pdf_main(input_path, output_folder, selection): }.get(selection, 0) if selection == 1: #投标人须知 output_paths = list( - extract_pages_tobidders_notice(input_path,output_folder, begin_pattern, begin_page, common_header)) + extract_pages_tobidders_notice(input_path,output_folder, begin_pattern, begin_page, common_header,is_secondary_match)) if output_paths and any(os.path.isfile(f) for f in output_paths): return output_paths else: @@ -495,93 +510,6 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix,common_header,las return [] -def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_name): - """ - 合并 output_folder 中以 {base_file_name}_before.pdf 结尾的 PDF 文件, - 以及 truncate_files 中以指定后缀结尾的文件,按照指定顺序合并。 - - 参数: - - output_folder (str): 包含以 {base_file_name}_before.pdf 结尾的 PDF 文件的文件夹路径。 - - truncate_files (list): 包含 PDF 文件路径的列表。 - - output_path (str): 合并后的 PDF 文件保存路径。 - - base_file_name (str): 用于匹配文件名的基础名称。 - - 返回: - - str: 合并后的 PDF 文件路径,如果没有合并则返回 ""。 - """ - # 1. 获取 output_folder 中所有文件 - try: - all_output_files = os.listdir(output_folder) - except FileNotFoundError: - print(f"输出文件夹 '{output_folder}' 未找到。") - return "" - except PermissionError: - print(f"没有权限访问输出文件夹 '{output_folder}'。") - return "" - - # 2. 定义要选择的文件后缀及合并顺序,包括 before 文件 - desired_suffixes = [ - f'{base_file_name}_before.pdf', - f'{base_file_name}_notice.pdf', - f'{base_file_name}_tobidders_notice_table.pdf' - ] - - all_pdfs_to_merge = [] - - for suffix in desired_suffixes: - if suffix == f'{base_file_name}_before.pdf': - # 从 output_folder 中选择以 {base_file_name}_before.pdf 结尾的文件 - matching_files = [ - os.path.join(output_folder, f) - for f in all_output_files - if f.endswith(suffix) - ] - else: - # 从 truncate_files 中选择以指定后缀结尾的文件 - matching_files = [f for f in truncate_files if f.endswith(suffix)] - - if matching_files: - # 如果找到多个匹配的文件,按名称排序并添加 - matching_files_sorted = sorted(matching_files) - all_pdfs_to_merge.extend(matching_files_sorted) - for f in matching_files_sorted: - print(f"选中文件: {f}") - else: - print(f"没有找到以 '{suffix}' 结尾的文件。") - - # 过滤掉不存在或为空的文件路径 - all_pdfs_to_merge = [f for f in all_pdfs_to_merge if os.path.isfile(f)] - - if not all_pdfs_to_merge: - print("没有找到要合并的 PDF 文件。") - return "" - - # 调用 merge_pdfs 函数进行合并 - try: - merge_pdfs(all_pdfs_to_merge, output_path) - print(f"已成功合并 PDF 文件到 '{output_path}'。") - except Exception as e: - print(f"合并 PDF 文件时出错: {e}") - return "" - - # 检查合并后的文件是否存在且不为空 - if os.path.exists(output_path) and os.path.getsize(output_path) > 0: - # 合并成功,删除 {base_file_name}_before.pdf 文件 - before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf") - if os.path.exists(before_pdf_path): - try: - os.remove(before_pdf_path) - print(f"已删除文件: {before_pdf_path}") - except Exception as e: - print(f"删除文件 {before_pdf_path} 时出错: {e}") - else: - print(f"未找到要删除的文件: {before_pdf_path}") - - return output_path - else: - print(f"合并失败,没有生成 '{output_path}'。") - return "" - def truncate_pdf_multiple(input_path, output_folder, unique_id="123"): global logger logger = get_global_logger(unique_id) @@ -615,7 +543,7 @@ def truncate_pdf_multiple(input_path, output_folder, unique_id="123"): # Proceed with merging logic as before... if any(f for f in truncate_files if f): # 检查是否有有效的文件路径 merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf") - merged_result = merge_selected_pdfs(output_folder, truncate_files, merged_output_path, base_file_name) + merged_result = merge_selected_pdfs_for_engineering(output_folder, truncate_files, merged_output_path, base_file_name) if merged_result: truncate_files.append(merged_result) logger.info(f"merged_baseinfo: 已生成合并文件: {merged_output_path}") @@ -662,7 +590,7 @@ def truncate_pdf_specific_engineering(pdf_path, output_folder, selections, uniqu # Proceed with merging logic as before... if any(f for f in truncate_files if f): # 检查是否有有效的文件路径 merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_specific.pdf") - merged_result = merge_selected_pdfs(output_folder, truncate_files, merged_output_path, base_file_name) + merged_result = merge_selected_pdfs_for_engineering(output_folder, truncate_files, merged_output_path, base_file_name) if merged_result: truncate_files.append(merged_result) logger.info(f"merged_specific: 已生成合并文件: {merged_output_path}") @@ -689,15 +617,15 @@ if __name__ == "__main__": input_path = "C:\\Users\\Administrator\\Desktop\\new招标文件\\招标文件\\HBDL-2024-0181-001-招标文件.pdf" # input_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\68549b0b-e892-41a9-897c-c3694535ee61\\ztbfile.pdf" # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\2-招标文件.pdf" - # input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹" + # input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf" output_folder="C:\\Users\\Administrator\\Desktop\\new招标文件\\招标文件\\tmp" # files=truncate_pdf_multiple(input_path,output_folder) # selections = [5, 1] # 仅处理 selection 5、1 # files=truncate_pdf_specific_engineering(input_path,output_folder,selections) # print(files) - selection = 5 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标 + selection = 3 # 例如:1 - 投标人须知前附表+正文, 2 - 评标办法, 3 -资格审查条件 4-招标公告 5-无效标 generated_files = truncate_pdf_main(input_path, output_folder, selection) - print("生成的文件:", generated_files) + # print("生成的文件:", generated_files) end_time=time.time() print("耗时:"+str(end_time-start_time)) \ No newline at end of file diff --git a/flask_app/货物标/截取pdf货物标版.py b/flask_app/货物标/截取pdf货物标版.py index 3bba4c8..db4b672 100644 --- a/flask_app/货物标/截取pdf货物标版.py +++ b/flask_app/货物标/截取pdf货物标版.py @@ -4,7 +4,7 @@ from PyPDF2 import PdfReader, PdfWriter import re # 导入正则表达式库 import os # 用于文件和文件夹操作 from flask_app.general.format_change import docx2pdf -from flask_app.general.merge_pdfs import merge_and_cleanup,merge_pdfs +from flask_app.general.merge_pdfs import merge_and_cleanup, merge_pdfs, merge_selected_pdfs_for_goods import concurrent.futures def get_global_logger(unique_id): if unique_id is None: @@ -367,14 +367,9 @@ def extract_pages_tobidders_notice(pdf_document, begin_pattern, begin_page, comm mid_page = i if start_page is not None and mid_page is not None and chapter_type: if re.search(end_pattern, cleaned_text): - if mid_page is None: - if i > start_page: - end_page = i - break - else: - if i > mid_page: - end_page = i - break + if i > mid_page: + end_page = i + break return start_page, mid_page, end_page @@ -536,91 +531,6 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo print(f"Error in save_extracted_pages: {e}") return "" # 返回空字符串 - -#合并封面+招标公告+投标人须知前附表+须知正文 -def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_name): - """ - 合并 output_folder 中以 {base_file_name}_before.pdf 结尾的 PDF 文件, - 以及 truncate_files 中以指定后缀结尾的文件,按照指定顺序合并。 - - 参数: - - output_folder (str): 包含以 {base_file_name}_before.pdf 结尾的 PDF 文件的文件夹路径。 - - truncate_files (list): 包含 PDF 文件路径的列表。 - - output_path (str): 合并后的 PDF 文件保存路径。 - - base_file_name (str): 用于匹配文件名的基础名称。 - - 返回: - - str: 如果合并成功,返回 output_path;否则,返回空字符串 ""。 - """ - # 1. 获取 output_folder 中所有文件 - try: - all_output_files = os.listdir(output_folder) - except FileNotFoundError: - print(f"输出文件夹 '{output_folder}' 未找到。") - return "" - except PermissionError: - print(f"没有权限访问输出文件夹 '{output_folder}'。") - return "" - - # 2. 定义要选择的文件后缀及合并顺序,包括 before 文件 - desired_suffixes = [ - f'{base_file_name}_before.pdf', - f'{base_file_name}_notice.pdf', - f'{base_file_name}_tobidders_notice_part1.pdf', - f'{base_file_name}_tobidders_notice_part2.pdf' - ] - - all_pdfs_to_merge = [] - - for suffix in desired_suffixes: - if suffix == f'{base_file_name}_before.pdf': - # 从 output_folder 中选择以 {base_file_name}_before.pdf 结尾的文件 - matching_files = [ - os.path.join(output_folder, f) - for f in all_output_files - if f.endswith(suffix) - ] - else: - # 从 truncate_files 中选择以指定后缀结尾的文件 - matching_files = [f for f in truncate_files if f.endswith(suffix)] - - if matching_files: - # 如果找到多个匹配的文件,按名称排序并添加 - matching_files_sorted = sorted(matching_files) - all_pdfs_to_merge.extend(matching_files_sorted) - for f in matching_files_sorted: - print(f"选中文件: {f}") - else: - print(f"没有找到以 '{suffix}' 结尾的文件。") - - print(f"总共将要合并的 PDF 文件数量: {len(all_pdfs_to_merge)}") - - if not all_pdfs_to_merge: - print("没有找到要合并的 PDF 文件。") - return "" - - # 调用 merge_pdfs 函数进行合并 - merge_pdfs(all_pdfs_to_merge, output_path) - print(f"已成功合并 PDF 文件到 '{output_path}'。") - - # 检查合并后的文件是否存在且不为空 - if os.path.exists(output_path) and os.path.getsize(output_path) > 0: - # 合并成功,删除 {base_file_name}_before.pdf 文件 - before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf") - if os.path.exists(before_pdf_path): - try: - os.remove(before_pdf_path) - print(f"已删除文件: {before_pdf_path}") - except Exception as e: - print(f"删除文件 {before_pdf_path} 时出错: {e}") - else: - print(f"未找到要删除的文件: {before_pdf_path}") - - return output_path - else: - print(f"合并失败,没有生成 '{output_path}'。") - return "" - def get_start_and_common_header(input_path): common_header = extract_common_header(input_path) last_begin_index = 0 @@ -756,7 +666,7 @@ def truncate_pdf_multiple(pdf_path, output_folder,unique_id="123"): # 定义合并后的输出路径 merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf") # 调用 merge_selected_pdfs 并获取返回值 - merged_path = merge_selected_pdfs(output_folder, truncate_files, merged_output_path, base_file_name) + merged_path = merge_selected_pdfs_for_goods(output_folder, truncate_files, merged_output_path, base_file_name) if merged_path: # 合并成功,添加合并后的文件路径 truncate_files.append(merged_path) @@ -806,7 +716,7 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1 # 定义合并后的输出路径 merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf") # 调用 merge_selected_pdfs 并获取返回值 - merged_path = merge_selected_pdfs(output_folder, truncate_files, merged_output_path, base_file_name) + merged_path = merge_selected_pdfs_for_goods(output_folder, truncate_files, merged_output_path, base_file_name) if merged_path: # 合并成功,添加合并后的文件路径 truncate_files.append(merged_path) @@ -819,9 +729,9 @@ def truncate_pdf_specific_goods(pdf_path, output_folder, selections,unique_id="1 return truncate_files -# TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 资格审查文件可能不需要默认与"evaluation"同一章 无效投标可能也要考虑 “more”的情况,类似工程标 +# TODO:交通智能系统和招标(1)(1)文件有问题 包头 绍兴 资格审查文件可能不需要默认与"evaluation"同一章 无效投标可能也要考虑 “more”的情况,类似工程标 唐山投标只有正文,没有附表 if __name__ == "__main__": - input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\zbtest4_evaluation_method.pdf" + input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\唐山市公安交通警察支队机动车查验机构视频存储回放系统竞争性谈判-招标文件正文(1).pdf" # input_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.pdf" # input_path="C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles" # input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\output1\\2-招标文件_procurement.pdf" @@ -832,6 +742,6 @@ if __name__ == "__main__": # selections = [1,4] # files=truncate_pdf_specific_goods(input_path,output_folder,selections) print(files) - # selection = 1# 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 + # selection = 4# 例如:1 - 公告, 2 - 评标办法, 3 - 资格审查后缀有qualification1或qualification2(与评标办法一致) 4.投标人须知前附表part1 投标人须知正文part2 5-采购需求 # generated_files = truncate_pdf_main(input_path, output_folder, selection) # print(generated_files) \ No newline at end of file