diff --git a/flask_app/general/截取pdf_main.py b/flask_app/general/截取pdf_main.py index e7aae06..34e3bc5 100644 --- a/flask_app/general/截取pdf_main.py +++ b/flask_app/general/截取pdf_main.py @@ -24,36 +24,33 @@ def truncate_pdf_multiple(pdf_path, output_folder, logger,mode='goods',selection 返回: - list: 截取和合并后的文件路径列表,如果截取或合并失败,则包含空字符串。 """ - if selections is None: - selections = [1, 2, 3, 4, 5] - # 确认模式有效 - if mode not in ['goods', 'engineering']: - raise ValueError("mode 参数必须是 'goods' 或 'engineering'") def handle_exception(selection): return ["", ""] if selection == 4 else [""] - # 设置模式相关的参数和函数 - if mode == 'goods': - logger.info("call 货物标截取pdf") - truncate_function = truncate_pdf_main_goods - merge_mode = 'goods' + modes_config = { + "goods": {"selections": [1, 2, 3, 4, 5, 6], "truncate_func": truncate_pdf_main_goods}, + "engineering": {"selections": [1, 2, 3, 4, 5], "truncate_func": truncate_pdf_main_engineering}, + } - # 根据 'goods' 模式定义异常处理的逻辑 - else: # mode == 'engineering' - logger.info("call 工程标标截取pdf") - truncate_function = truncate_pdf_main_engineering - merge_mode = 'engineering' + # 验证 mode 是否有效 + if mode not in modes_config: + raise ValueError("mode 参数必须是 'goods' 或 'engineering'") - num_selections = len(selections) - res = check_pdf_pages(pdf_path, logger) - if res is not None: - return res # 返回包含空字符串的列表 + # 初始化 mode 参数 + config = modes_config[mode] + truncate_function = config["truncate_func"] + selections = selections or config["selections"] + + # 检查 PDF 页数逻辑 + skip, empty_return = check_pdf_pages(pdf_path, mode, logger) + if skip: + return empty_return base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] # 纯文件名 truncate_files = [] # 使用 ThreadPoolExecutor 进行多线程处理 - with concurrent.futures.ThreadPoolExecutor(max_workers=num_selections) as executor: + with concurrent.futures.ThreadPoolExecutor(max_workers=len(selections)) as executor: # 提交所有任务并保持 selection 顺序 future_to_selection = { selection: executor.submit( @@ -73,14 +70,17 @@ def truncate_pdf_multiple(pdf_path, output_folder, logger,mode='goods',selection future = future_to_selection.get(selection) try: files = future.result() - if files: + # 扁平化返回的结果 + if isinstance(files, list): truncate_files.extend(files) - # 无需额外处理,因为 `truncate_function` 已处理空字符串的添加和日志记录 + elif isinstance(files, str): # 如果返回单个字符串,直接添加 + truncate_files.append(files) + else: + logger.warning(f"未知的返回类型: {type(files)},跳过该结果") except Exception as e: logger.error(f"Selection {selection} 生成了一个异常: {e}") # 根据模式和 selection 添加相应数量的空字符串 truncate_files.extend(handle_exception(selection)) - # 定义合并后的输出路径 merged_output_path = os.path.join(output_folder, f"{base_file_name}_merged_baseinfo.pdf") @@ -90,7 +90,7 @@ def truncate_pdf_multiple(pdf_path, output_folder, logger,mode='goods',selection truncate_files, merged_output_path, base_file_name, - mode=merge_mode + mode=mode ) if merged_path: @@ -109,15 +109,17 @@ if __name__ == "__main__": logger=get_global_logger("123") start_time = time.time() # input_path = r"C:\Users\Administrator\Desktop\new招标文件\工程标" - pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf" - + # pdf_path=r"C:\Users\Administrator\Desktop\货物标\zbfiles\094定稿-湖北工业大学轻武器模拟射击设备采购项目招标文件.pdf" + pdf_path = r"C:\Users\Administrator\Desktop\货物标\zbfiles\zbtest4_evaluation_method.pdf" # pdf_path = r"C:\Users\Administrator\Desktop\招标文件\招标02.pdf" + # pdf_path=r'C:\Users\Administrator\Desktop\货物标\zbfiles\招标文件(实高电子显示屏).pdf' # input_path=r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf" output_folder = r"C:\Users\Administrator\Desktop\fsdownload\91399aa4-1ee8-447d-a05b-03cd8d15ced5\tmp" # selections = [1, 4] # 仅处理 selection 4、1 - # selections=[5] + selections=[6] #engineering - files=truncate_pdf_multiple(pdf_path,output_folder,logger,'goods') + # files=truncate_pdf_multiple(pdf_path,output_folder,logger,'goods',selections) + files = truncate_pdf_multiple(pdf_path, output_folder, logger, 'engineering') print(files) # selection = 1 # 例如:1 - 招标公告, 2 - 评标办法, 3 -资格审查条件 4-投标人须知前附表+正文 5-无效标 # generated_files = truncate_pdf_main_engineering(pdf_path, output_folder, selection,logger) diff --git a/flask_app/general/截取pdf通用函数.py b/flask_app/general/截取pdf通用函数.py index 30b2ef9..9545c3a 100644 --- a/flask_app/general/截取pdf通用函数.py +++ b/flask_app/general/截取pdf通用函数.py @@ -7,6 +7,7 @@ from PyPDF2 import PdfReader, PdfWriter from flask_app.general.clean_pdf import extract_common_header, clean_page_content from flask_app.general.format_change import docx2pdf +import concurrent.futures def get_start_and_common_header(input_path,end_page): @@ -34,20 +35,26 @@ def get_start_and_common_header(input_path,end_page): return common_header, last_begin_index return common_header, last_begin_index -def check_pdf_pages(pdf_path, logger): +def check_pdf_pages(pdf_path,mode, logger): try: reader = PdfReader(pdf_path) num_pages = len(reader.pages) logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}") if num_pages <= 50: logger.info("PDF页数小于或等于50页,跳过切分逻辑。") - return ['', '', '', '', '', '', ''] + if mode=='goods': + return True,['', '', '', '', '', '', '',''] + else: + return True,['', '', '', '', '', '', ''] # 若页数大于50页,返回None表示继续处理 - return None + return False, [] except Exception as e: logger.error(f"无法读取 PDF 页数: {e}") # 返回空列表意味着无法执行后续处理逻辑 - return ['', '', '', '', '', '', ''] + if mode == 'goods': + return True,['', '', '', '', '', '', '', ''] + else: + return True,['', '', '', '', '', '', ''] def save_extracted_pages(pdf_path,output_folder,start_page, end_page, output_suffix,common_header): @@ -108,22 +115,151 @@ def convert_to_pdf(file_path): return docx2pdf(file_path) return file_path -def get_invalid_file(file_path,output_folder,common_header): - pdf_document = PdfReader(file_path) - total_pages = len(pdf_document.pages) - begin_pattern=[regex.compile( - r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请',regex.MULTILINE - ), + +def get_invalid_file(file_path, output_folder, common_header,begin_page): + """ + 从指定的PDF文件中提取无效部分并保存到输出文件夹中。 + begin_pattern从前往后匹配 end_pattern从后往前 + 页数小于100不进行begin_pattern,默认start_page=0 + 页数大于200时end_pattern从前往后匹配 + + + Args: + file_path (str): 输入的PDF文件路径。 + output_folder (str): 提取后文件的输出文件夹路径。 + common_header (str): 公共头部文本,用于清理每页的内容。 + + Returns: + list: 包含保存的文件路径的列表,如果提取失败则返回包含空字符串的列表。 + """ + # 定义开始模式 + begin_patterns = [ regex.compile( - r'.*(? 200: + # print("总页数大于200页,结束页从前往后查找,跳过前30页。") + for pattern in end_patterns: + for i in range(30, total_pages): + text = page_texts[i] + if regex.search(pattern, text): + # print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}") + return i + else: + # print("结束页从后往前查找,确保只剩前30页时停止。") + for pattern in end_patterns: + for i in range(total_pages - 1, 29, -1): + text = page_texts[i] + if regex.search(pattern, text): + # print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}") + return i + + # 如果没有匹配到,设置end_page逻辑 + if total_pages > 100: + print("未找到结束模式,总页数大于100,设置结束页为前100页。"+file_path) + return max(100, int(total_pages * 2 / 3)) + else: + print("未找到结束模式,设置结束页为最后一页。"+file_path) + return total_pages - 1 + + # 根据总页数决定是否查找起始页 + if total_pages < 100: + # print("总页数少于100页,默认起始页为第一页。") + start_page = 0 + with concurrent.futures.ThreadPoolExecutor() as executor: + future_end = executor.submit(find_end_page) + end_page = future_end.result() + else: + with concurrent.futures.ThreadPoolExecutor() as executor: + future_start = executor.submit(find_start_page,begin_page) + future_end = executor.submit(find_end_page) + start_page = future_start.result() + end_page = future_end.result() + + # print(f"起始页: {start_page + 1}, 结束页: {end_page + 1}") + + # 验证页码范围 + if start_page > end_page: + print(f"无效的页码范围: 起始页 ({start_page + 1}) > 结束页 ({end_page + 1})") + return [""] + + # 调用已实现的保存函数 + output_path = save_extracted_pages( + pdf_path=file_path, + output_folder=output_folder, + start_page=start_page, + end_page=end_page, + output_suffix="invalid", + common_header=common_header + ) + + # print(f"提取的页面 {start_page} 到 {end_page} 已保存至 {output_path}") + return output_path + + except Exception as e: + print(f"处理文件 {file_path} 时发生错误: {e}") + return "" + +if __name__ == "__main__": + file_path=r'C:\Users\Administrator\Desktop\new招标文件\货物标\HBDL-2024-0310-001-招标文件.pdf' + output_folder=r'C:\Users\Administrator\Desktop\new招标文件\货物标\tmp' + res=get_invalid_file(file_path,output_folder,"") diff --git a/flask_app/general/截取文件格式.py b/flask_app/old_version/截取文件格式.py similarity index 100% rename from flask_app/general/截取文件格式.py rename to flask_app/old_version/截取文件格式.py diff --git a/flask_app/routes/工程标解析main.py b/flask_app/routes/工程标解析main.py index a33f872..4a1163b 100644 --- a/flask_app/routes/工程标解析main.py +++ b/flask_app/routes/工程标解析main.py @@ -54,7 +54,7 @@ def preprocess_files(output_folder, file_path, file_type,logger): tobidders_notice_table = truncate_files[3] #投标人须知前附表 tobidders_notice=truncate_files[4] #投标人须知正文 - invalid_path=truncate_files[5] if truncate_files[5]!="" else pdf_path #无效标 + invalid_path=truncate_files[5] if truncate_files[5] != "" else pdf_path #无效标 # invalid_docpath = copy_docx(docx_path) # docx截取无效标部分 # invalid_docpath=pdf2docx(invalid_path) invalid_added_pdf = insert_mark(invalid_path) diff --git a/flask_app/routes/货物标解析main.py b/flask_app/routes/货物标解析main.py index f443c57..9fbda5b 100644 --- a/flask_app/routes/货物标解析main.py +++ b/flask_app/routes/货物标解析main.py @@ -43,7 +43,7 @@ def preprocess_files(output_folder, file_path, file_type,logger): truncate_files = truncate_pdf_multiple(pdf_path, output_folder,logger,'goods') # index: 0->商务技术服务要求 1->评标办法 2->资格审查 3->投标人须知前附表 4->投标人须知正文 # 处理各个部分 - invalid_path=pdf_path + invalid_path=truncate_files[6] if truncate_files[6] != "" else pdf_path #无效标 invalid_added_pdf = insert_mark(invalid_path) invalid_added_docx = pdf2docx(invalid_added_pdf) # 有标记的invalid_path @@ -62,7 +62,7 @@ def preprocess_files(output_folder, file_path, file_type,logger): qualification_path = truncate_files[2] # 资格审查 tobidders_notice_path = truncate_files[4] # 投标人须知正文 notice_path = truncate_files[0] #招标公告 - merged_baseinfo_path = truncate_files[6] # 合并封面+招标公告+投标人须知前附表+须知正文 + merged_baseinfo_path = truncate_files[7] # 合并封面+招标公告+投标人须知前附表+须知正文 clause_path = convert_clause_to_json(tobidders_notice_path, output_folder) # 投标人须知正文条款pdf->json end_time = time.time() diff --git a/flask_app/test_case/test_正则表达式.py b/flask_app/test_case/test_正则表达式.py index c9ecb3b..8bb267b 100644 --- a/flask_app/test_case/test_正则表达式.py +++ b/flask_app/test_case/test_正则表达式.py @@ -1,10 +1,9 @@ import regex begin_pattern = regex.compile( - r'(?