From 6fb4f49b139fba210ca936b9a569673b24a815d2 Mon Sep 17 00:00:00 2001 From: zy123 <646228430@qq.com> Date: Wed, 18 Dec 2024 10:32:24 +0800 Subject: [PATCH] =?UTF-8?q?12.18=20=E6=88=AA=E5=8F=96pdf?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- flask_app/general/merge_pdfs.py | 2 +- flask_app/general/截取pdf_main.py | 16 +- flask_app/general/截取pdf通用函数.py | 74 ++++- flask_app/工程标/截取pdf工程标版.py | 462 +++++++++++++-------------- flask_app/货物标/截取pdf货物标版.py | 294 ++++++++--------- 5 files changed, 430 insertions(+), 418 deletions(-) diff --git a/flask_app/general/merge_pdfs.py b/flask_app/general/merge_pdfs.py index c6db9e9..9677e94 100644 --- a/flask_app/general/merge_pdfs.py +++ b/flask_app/general/merge_pdfs.py @@ -118,7 +118,7 @@ def merge_selected_pdfs(output_folder, truncate_files, output_path, base_file_na required_suffixes = [ f'{base_file_name}_before.pdf', f'{base_file_name}_notice.pdf', - f'{base_file_name}_tobidders_notice_table.pdf' + f'{base_file_name}_tobidders_notice_table.pdf' #tobidders_notice tobidders_notice_table ] optional_suffixes = [] elif mode == 'goods': diff --git a/flask_app/general/截取pdf_main.py b/flask_app/general/截取pdf_main.py index b435d72..0edd6a7 100644 --- a/flask_app/general/截取pdf_main.py +++ b/flask_app/general/截取pdf_main.py @@ -34,20 +34,18 @@ def truncate_pdf_multiple(pdf_path, output_folder, logger,mode='goods',selection return ["", ""] if selection == 4 else [""] # 设置模式相关的参数和函数 if mode == 'goods': + logger.info("call 货物标截取pdf") truncate_function = truncate_pdf_main_goods merge_mode = 'goods' # 根据 'goods' 模式定义异常处理的逻辑 else: # mode == 'engineering' + logger.info("call 工程标标截取pdf") truncate_function = truncate_pdf_main_engineering merge_mode = 'engineering' - mode_flag = 1 num_selections = len(selections) - if num_selections < 5: - mode_flag = 2 - - res = check_pdf_pages(pdf_path, mode_flag, logger) + res = check_pdf_pages(pdf_path, logger) if res is not None: return res # 返回包含空字符串的列表 @@ -63,7 +61,8 @@ def truncate_pdf_multiple(pdf_path, output_folder, logger,mode='goods',selection pdf_path, output_folder, selection, - logger + logger, + "default" # 如果 'goods' 模式需要额外参数,如 output_suffix,可以在这里处理 ) for selection in selections @@ -115,8 +114,9 @@ if __name__ == "__main__": # pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf" # input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf" output_folder = r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp" - # selections = [1, 4] # 仅处理 selection 4、1 - selections=[5] + selections = [1, 4] # 仅处理 selection 4、1 + # selections=[5] + #engineering files=truncate_pdf_multiple(pdf_path,output_folder,logger,'engineering',selections) print(files) # selection = 1 # 例如:1 - 招标公告, 2 - 评标办法, 3 -资格审查条件 4-投标人须知前附表+正文 5-无效标 diff --git a/flask_app/general/截取pdf通用函数.py b/flask_app/general/截取pdf通用函数.py index b00f8f6..19876d3 100644 --- a/flask_app/general/截取pdf通用函数.py +++ b/flask_app/general/截取pdf通用函数.py @@ -3,9 +3,11 @@ import concurrent.futures import os import regex -from PyPDF2 import PdfReader +from PyPDF2 import PdfReader, PdfWriter from flask_app.general.clean_pdf import extract_common_header, clean_page_content +from flask_app.general.format_change import docx2pdf + def get_start_and_common_header(input_path,end_page): common_header = extract_common_header(input_path) @@ -32,26 +34,78 @@ def get_start_and_common_header(input_path,end_page): return common_header, last_begin_index return common_header, last_begin_index -def check_pdf_pages(pdf_path, mode,logger): +def check_pdf_pages(pdf_path, logger): try: reader = PdfReader(pdf_path) num_pages = len(reader.pages) logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}") if num_pages <= 50: logger.info("PDF页数小于或等于50页,跳过切分逻辑。") - if mode==1: - return ['', '', '', '', '', '', ''] - else: - return ['','','',''] + return ['', '', '', '', '', '', ''] # 若页数大于50页,返回None表示继续处理 return None except Exception as e: logger.error(f"无法读取 PDF 页数: {e}") # 返回空列表意味着无法执行后续处理逻辑 - if mode == 1: - return ['', '', '', '', '', '', ''] - else: - return ['', '', '', ''] + return ['', '', '', '', '', '', ''] +def save_extracted_pages(pdf_path,output_folder,start_page, end_page, output_suffix,common_header): + try: + # 检查 start_page 和 end_page 是否为 None + if start_page is None or end_page is None: + print("Error: start_page 或 end_page 为 None") + return "" + pdf_document = PdfReader(pdf_path) + total_pages = len(pdf_document.pages) + base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] + output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf") + + if start_page < 0 or end_page >= total_pages or start_page > end_page: + print(f"无效的页面范围: {start_page} 到 {end_page}") + return "" + + if output_suffix == 'notice' and start_page > 0: + before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf") + before_doc = PdfWriter() + toc_page = -1 + # 查找目录页 + for page_num in range(min(start_page, total_pages)): + page_text = pdf_document.pages[page_num].extract_text() + cleaned_text = clean_page_content(page_text, common_header) + if regex.search(r'目\s*录', cleaned_text, regex.MULTILINE): + toc_page = page_num + break + + # 确定截取的页数 + pages_to_extract = toc_page + 1 if toc_page != -1 else start_page + # 提取页面 + for page_num in range(pages_to_extract): + before_doc.add_page(pdf_document.pages[page_num]) + print(before_pdf_path) + with open(before_pdf_path, 'wb') as f_before: + before_doc.write(f_before) + + output_doc = PdfWriter() + for page_num in range(start_page, end_page + 1): + output_doc.add_page(pdf_document.pages[page_num]) + with open(output_pdf_path, 'wb') as f: + output_doc.write(f) + print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}") + return output_pdf_path + except Exception as e: + print(f"Error in save_extracted_pages: {e}") + return "" # 返回空字符串 + +def is_pdf_or_doc(filename): + # 判断文件是否为PDF或Word文档 + return filename.lower().endswith(('.pdf', '.doc', '.docx')) + + +def convert_to_pdf(file_path): + # 假设 docx2pdf 函数已经被定义,这里仅根据文件扩展名来决定是否需要转换 + if file_path.lower().endswith(('.doc', '.docx')): + return docx2pdf(file_path) + return file_path + diff --git a/flask_app/工程标/截取pdf工程标版.py b/flask_app/工程标/截取pdf工程标版.py index 9af7820..f030f86 100644 --- a/flask_app/工程标/截取pdf工程标版.py +++ b/flask_app/工程标/截取pdf工程标版.py @@ -5,66 +5,9 @@ import time from PyPDF2 import PdfReader, PdfWriter from flask_app.general.clean_pdf import clean_page_content -from flask_app.general.截取pdf通用函数 import get_start_and_common_header +from flask_app.general.截取pdf通用函数 import get_start_and_common_header,save_extracted_pages from flask_app.general.通用功能函数 import get_global_logger - -def save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page, common_header): - try: - # 获取文件基本名称 - base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] - - # 构建主要输出文件路径 - output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf") - # 读取PDF文件 - pdf_document = PdfReader(pdf_path) - total_pages = len(pdf_document.pages) - - # 检查起始和结束页码是否有效 - if start_page < 0 or end_page >= total_pages or start_page > end_page: - print(f"无效的页面范围: {start_page} 到 {end_page}") - return "" - - # 如果 output_suffix 是 'notice',保存 start_page 之前的页面(若遇到'目录',则截取到'目录') - if output_suffix == 'notice' and start_page > 0: - before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf") - before_doc = PdfWriter() - toc_page = -1 - # 查找目录页 - for page_num in range(min(start_page, total_pages)): - page_text = pdf_document.pages[page_num].extract_text() - cleaned_text = clean_page_content(page_text, common_header) - if regex.search(r'目\s*录', cleaned_text, regex.MULTILINE): - toc_page = page_num - break - - # 确定截取的页数 - pages_to_extract = toc_page + 1 if toc_page != -1 else start_page - # 提取页面 - for page_num in range(pages_to_extract): - before_doc.add_page(pdf_document.pages[page_num]) - print(before_pdf_path) - with open(before_pdf_path, 'wb') as f_before: - before_doc.write(f_before) - - # print(f"已保存页面从 0 到 {pages_to_extract} 为 {before_pdf_path}") - - # 提取指定范围的页面 - output_doc = PdfWriter() - for page_num in range(start_page, end_page + 1): - output_doc.add_page(pdf_document.pages[page_num]) - - # 保存新的PDF文件 - with open(output_pdf_path, 'wb') as f_output: - output_doc.write(f_output) - print(f"{output_suffix} 已截取并保存页面从 {start_page + 1} 到 {end_page + 1} 为 {output_pdf_path}") - return output_pdf_path - - except Exception as e: - print(f"Error in save_pages_to_new_pdf: {e}") - return "" # 返回空字符串 - - def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin_page, common_header, is_secondary_match): pdf_document = PdfReader(pdf_path) @@ -75,6 +18,8 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin mid_page = None end_page = None chapter_type = None # 用于存储“章”或“部分” + combined_mid_pattern= None + end_pattern = None for i, page in enumerate(pdf_document.pages): text = page.extract_text() or "" @@ -87,7 +32,7 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin match = regex.search(begin_pattern, cleaned_text) if match and i > begin_page: start_page = i - matched_text = match.group(0) # 获取整个匹配的文本 + matched_text: str = match.group(0) # 获取整个匹配的文本 if '章' in matched_text: chapter_type = '章' elif '部分' in matched_text: @@ -175,14 +120,12 @@ def extract_pages_tobidders_notice(pdf_path, output_folder, begin_pattern, begin print(f"first: tobidders_notice 未找到起始或结束页在文件 {pdf_path} 中!尝试备用提取策略。") return "", "" - path1 = save_pages_to_new_pdf(pdf_path, output_folder, "tobidders_notice_table", start_page, mid_page, - common_header) - path2 = save_pages_to_new_pdf(pdf_path, output_folder, "tobidders_notice", mid_page, end_page, common_header) + path1 = save_extracted_pages(pdf_path, output_folder, start_page, mid_page, "tobidders_notice_table", common_header) + path2 = save_extracted_pages(pdf_path, output_folder, mid_page, end_page, "tobidders_notice", common_header) return path1, path2 -def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix, common_header, - is_secondary_match=False): +def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix, common_header): # 打开PDF文件 pdf_document = PdfReader(pdf_path) start_page = None @@ -194,7 +137,6 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter text = page.extract_text() if text: cleaned_text = clean_page_content(text, common_header) - # if is_secondary_match and regex.search(exclusion_pattern, cleaned_text): # 跳过投标人须知正文中的"投标文件的组成" if regex.search(exclusion_pattern, cleaned_text): continue if regex.search(begin_pattern, cleaned_text) and i >= begin_page: @@ -215,7 +157,7 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter print(f"{output_suffix} first: 未找到起始或结束页在文件 {pdf_path} 中!") return [""] else: - return [save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page, common_header)] + return [save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)] def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, last_begin_index): @@ -277,7 +219,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l print(f"{output_suffix} twice: 未找到起始或结束页在文件 {pdf_path} 中!") return [] else: - return [save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page, common_header)] + return [save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)] elif output_suffix == "invalid": pdf_document = PdfReader(pdf_path) total_pages = len(pdf_document.pages) @@ -285,7 +227,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l total = int(total_pages * 2 / 3) start_page = last_begin_index end_page = min(90, total) - return [save_pages_to_new_pdf(pdf_path, output_folder, output_suffix, start_page, end_page, common_header)] + return [save_extracted_pages(pdf_path, output_folder, start_page, end_page, output_suffix, common_header)] else: print(f"{output_suffix} twice: 未定义的输出后缀。") return [] @@ -294,174 +236,224 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, l return [] -def truncate_pdf_main_engineering(input_path, output_folder, selection,logger): - if os.path.isdir(input_path): - generated_files = [] - for file_name in os.listdir(input_path): - if file_name.endswith('.pdf'): - file_path = os.path.join(input_path, file_name) - files = truncate_pdf_main_engineering(file_path, output_folder, selection) - if files: - generated_files.extend(files) - return generated_files - elif os.path.isfile(input_path) and input_path.endswith('.pdf'): - # base_file_name = os.path.splitext(os.path.basename(input_path))[0] - common_header, last_begin_index = get_start_and_common_header(input_path,20) - # print(last_begin_index) - if selection == 1: - # Selection 1: 招标公告 - pattern_pairs = [ - ( - regex.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请'), - regex.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', regex.MULTILINE) - ), - ( - regex.compile( - r'.*(? begin_page: start_page = i - matched_text = match.group(0) # 获取整个匹配的文本 - + matched_text:str = match.group(0) # 获取整个匹配的文本 # 如果未提供固定的 end_pattern,则根据匹配的章节类型动态生成 if not local_end_pattern: if '章' in matched_text: @@ -426,7 +409,7 @@ def extract_pages_qualification(pdf_document, begin_page, common_header): return start_page, end_page -def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page): +def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, begin_page, logger): try: exclusion_pattern = regex.compile( r'文件的构成|文件的组成|须对应|需对应|须按照|需按照|须根据|需根据|文件组成|文件构成') @@ -467,7 +450,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, b new_file.write(original_file.read()) return new_file_path else: - temp = truncate_pdf_main_goods(pdf_path, output_folder, 2, "qualification2") + temp = truncate_pdf_main_goods(pdf_path, output_folder, 2,logger, "qualification2") if len(temp) > 0: return temp[0] else: @@ -475,56 +458,131 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header, b else: print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!") return "" - return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix) + return save_extracted_pages(pdf_path, output_folder,start_page, end_page, output_suffix,common_header) except Exception as e: print(f"Error in extract_pages_twice: {e}") return "" - -def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix): +def truncate_pdf_main_goods(input_path, output_folder, selection,logger, output_suffix="default"): try: - # 检查 start_page 和 end_page 是否为 None - if start_page is None or end_page is None: - print("Error: start_page 或 end_page 为 None") - return "" + # Function to handle processing of a single file + def process_single_file(input_path, output_folder, selection, output_suffix): + try: + # 创建输出文件夹 + if not os.path.exists(output_folder): + os.makedirs(output_folder) - base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] - output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf") + # 获取起始和通用页眉 + common_header, last_begin_index = get_start_and_common_header(input_path, 10) + begin_page = last_begin_index if last_begin_index != 0 else { + 4: 1, + 2: 5, + 3: 5, + 1: 0, + 5: 3, + 6: 0 # Added default for selection 6 if needed + }.get(selection, 0) - if start_page < 0 or end_page >= len(pdf_document.pages) or start_page > end_page: - print(f"无效的页面范围: {start_page} 到 {end_page}") - return "" + # 根据选择设置对应的模式和结束模式 + if selection == 1: + begin_pattern = regex.compile( + r'.*(?= 0: - before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf") - before_doc = PdfWriter() - for page_num in range(0, start_page): - before_doc.add_page(pdf_document.pages[page_num]) - with open(before_pdf_path, 'wb') as f: - before_doc.write(f) - # print(f"已保存页面从 0 到 {start_page - 1} 为 {before_pdf_path}") + # 如果传入的 output_suffix 是 'default',则使用本地生成的 output_suffix + if output_suffix == "default": + output_suffix = local_output_suffix - output_doc = PdfWriter() - for page_num in range(start_page, end_page + 1): - output_doc.add_page(pdf_document.pages[page_num]) - with open(output_pdf_path, 'wb') as f: - output_doc.write(f) - print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}") - return output_pdf_path - except Exception as e: - print(f"Error in save_extracted_pages: {e}") - return "" # 返回空字符串 + # 将原先的 process_files 逻辑合并到此处 + pdf_path = convert_to_pdf(input_path) + result = extract_pages( + pdf_path, + output_folder, + begin_pattern, + begin_page, + end_pattern, + output_suffix, + logger + ) + # 根据提取结果以及不同的 output_suffix 进行处理 + if result: + if output_suffix == "tobidders_notice": + # 确保返回的是元组,并将其中的 None 转换为 "" + if isinstance(result, tuple) and len(result) == 2: + path1, path2 = result + return [path1 or "", path2 or ""] + else: + logger.error("Unexpected result format for 'tobidders_notice'") + return ["",""] + elif output_suffix == "qualification1": + merge_and_cleanup(result, "qualification3") + return [result or ""] + else: + return [result or ""] + return [""] * (2 if selection == 4 else 1) + except Exception as e: + logger.error(f"Error in processing file '{input_path}': {e}") + return [""] * (2 if selection == 4 else 1) -def truncate_pdf_main_goods(input_path, output_folder, selection, output_suffix="default"): - try: # 检查是否为文件夹 if os.path.isdir(input_path): generated_files = [] for file_name in os.listdir(input_path): file_path = os.path.join(input_path, file_name) if is_pdf_or_doc(file_path): - result = process_input(file_path, output_folder, selection, output_suffix) + result = process_single_file(file_path, output_folder, selection, output_suffix) if isinstance(result, tuple): generated_files.extend([f if f else "" for f in result]) else: @@ -533,102 +591,17 @@ def truncate_pdf_main_goods(input_path, output_folder, selection, output_suffix= # 单文件情况 elif os.path.isfile(input_path) and is_pdf_or_doc(input_path): - return process_input(input_path, output_folder, selection, output_suffix) + return process_single_file(input_path, output_folder, selection, output_suffix) else: - print("提供的路径既不是文件夹也不是PDF文件。") - return [''] + logger.error("提供的路径既不是文件夹也不是PDF文件。") + return [""] * (2 if selection == 4 else 1) except Exception as e: - print(f"Error in truncate_pdf_main: {e}") - return [''] # 返回空字符串 + logger.error(f"Error in truncate_pdf_main_goods: {e}") + return [""] * (2 if selection == 4 else 1) # 返回空字符串 -def process_input(input_path, output_folder, selection, output_suffix): - try: - # 创建输出文件夹 - if not os.path.exists(output_folder): - os.makedirs(output_folder) - - # 获取起始和通用页眉 - common_header, last_begin_index = get_start_and_common_header(input_path,10) - begin_page = last_begin_index if last_begin_index != 0 else { - 4: 1, - 2: 5, - 3: 5, - 1: 0, - 5: 3 - }.get(selection, 0) - - # 根据选择设置对应的模式和结束模式 - if selection == 1: - begin_pattern = regex.compile( - r'.*(?