#flask_app/general/截取pdf通用函数.py import concurrent.futures import os import regex from PyPDF2 import PdfReader, PdfWriter from flask_app.general.clean_pdf import extract_common_header, clean_page_content from flask_app.general.format_change import docx2pdf import concurrent.futures def get_start_and_common_header(input_path,end_page): common_header = extract_common_header(input_path) last_begin_index = 0 begin_pattern = regex.compile( r'.*(? end_page: return common_header, 0 # 如果页码大于end_page,直接返回last_begin_index=0 text = page.extract_text() if text: cleaned_text = clean_page_content(text, common_header) # 检查是否存在"目录" if catalog_pattern.search(cleaned_text): continue # 如果存在目录,跳过当前页面 if begin_pattern.search(cleaned_text): last_begin_index = i # 更新第一个匹配的索引,页码从0开始 return common_header, last_begin_index return common_header, last_begin_index def check_pdf_pages(pdf_path,mode, logger): try: reader = PdfReader(pdf_path) num_pages = len(reader.pages) logger.info(f"PDF '{pdf_path}' 的总页数为: {num_pages}") if num_pages <= 30: logger.info("PDF(invalid_path)页数小于或等于30页,跳过切分逻辑。") if mode=='goods': return True,['', '', '', '', '', '', pdf_path,''] else: return True,['', '', '', '', '', pdf_path, ''] # 若页数大于30页,返回None表示继续处理 return False, [] except Exception as e: logger.error(f"无法读取 PDF 页数: {e}") # 返回空列表意味着无法执行后续处理逻辑 if mode == 'goods': return True,['', '', '', '', '', '', pdf_path, ''] else: return True,['', '', '', '', '', pdf_path, ''] def save_extracted_pages(pdf_path,output_folder,start_page, end_page, output_suffix,common_header): try: # 检查 start_page 和 end_page 是否为 None if start_page is None or end_page is None: print("Error: start_page 或 end_page 为 None") return "" pdf_document = PdfReader(pdf_path) total_pages = len(pdf_document.pages) base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf") if start_page < 0 or end_page >= total_pages or start_page > end_page: print(f"无效的页面范围: {start_page} 到 {end_page}") return "" if output_suffix == 'notice' and start_page > 0: before_pdf_path = os.path.join(output_folder, f"{base_file_name}_before.pdf") before_doc = PdfWriter() toc_page = -1 # 查找目录页 for page_num in range(min(start_page, total_pages)): page_text = pdf_document.pages[page_num].extract_text() cleaned_text = clean_page_content(page_text, common_header) if regex.search(r'目\s*录', cleaned_text, regex.MULTILINE): toc_page = page_num break # 确定截取的页数 pages_to_extract = toc_page + 1 if toc_page != -1 else start_page # 提取页面 for page_num in range(pages_to_extract): before_doc.add_page(pdf_document.pages[page_num]) print(before_pdf_path) with open(before_pdf_path, 'wb') as f_before: before_doc.write(f_before) output_doc = PdfWriter() for page_num in range(start_page, end_page + 1): output_doc.add_page(pdf_document.pages[page_num]) with open(output_pdf_path, 'wb') as f: output_doc.write(f) print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}") return output_pdf_path except Exception as e: print(f"Error in save_extracted_pages: {e}") return "" # 返回空字符串 def is_pdf_or_doc(filename): # 判断文件是否为PDF或Word文档 return filename.lower().endswith(('.pdf', '.doc', '.docx')) def convert_to_pdf(file_path): # 假设 docx2pdf 函数已经被定义,这里仅根据文件扩展名来决定是否需要转换 if file_path.lower().endswith(('.doc', '.docx')): return docx2pdf(file_path) return file_path def get_invalid_file(file_path, output_folder, common_header,begin_page): """ 从指定的PDF文件中提取无效部分并保存到输出文件夹中。 begin_pattern从前往后匹配 end_pattern从后往前 页数小于100不进行begin_pattern,默认start_page=0 页数大于200时end_pattern从前往后匹配 Args: file_path (str): 输入的PDF文件路径。 output_folder (str): 提取后文件的输出文件夹路径。 common_header (str): 公共头部文本,用于清理每页的内容。 Returns: list: 包含保存的文件路径的列表,如果提取失败则返回包含空字符串的列表。 """ # 定义开始模式 begin_patterns = [ regex.compile( r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:公告|邀请书|邀请函|邀请).*|^第一卷|^投标邀请书|^投标邀请函|^投标邀请', regex.MULTILINE ), regex.compile( r'.*(? 200: # print("总页数大于200页,结束页从前往后查找,跳过前30页。") for pattern in end_patterns: for i in range(30, total_pages): text = page_texts[i] if regex.search(pattern, text): # print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}") return i else: # print("结束页从后往前查找,确保只剩前30页时停止。") for pattern in end_patterns: for i in range(total_pages - 1, 29, -1): text = page_texts[i] if regex.search(pattern, text): # print(f"结束模式在第 {i + 1} 页匹配: {pattern.pattern}") return i # 如果没有匹配到,设置end_page逻辑 if total_pages > 100: print("未找到结束模式,总页数大于100,设置结束页为前100页。"+file_path) return max(100, int(total_pages * 2 / 3)) else: print("未找到结束模式,设置结束页为最后一页。"+file_path) return total_pages - 1 # 根据总页数决定是否查找起始页 if total_pages < 100: # print("总页数少于100页,默认起始页为第一页。") start_page = 0 with concurrent.futures.ThreadPoolExecutor() as executor: future_end = executor.submit(find_end_page) end_page = future_end.result() else: with concurrent.futures.ThreadPoolExecutor() as executor: future_start = executor.submit(find_start_page,begin_page) future_end = executor.submit(find_end_page) start_page = future_start.result() end_page = future_end.result() # print(f"起始页: {start_page + 1}, 结束页: {end_page + 1}") # 验证页码范围 if start_page > end_page: print(f"无效的页码范围: 起始页 ({start_page + 1}) > 结束页 ({end_page + 1})") return "" # 调用已实现的保存函数 output_path = save_extracted_pages( pdf_path=file_path, output_folder=output_folder, start_page=start_page, end_page=end_page, output_suffix="invalid", common_header=common_header ) # print(f"提取的页面 {start_page} 到 {end_page} 已保存至 {output_path}") return output_path except Exception as e: print(f"处理文件 {file_path} 时发生错误: {e}") return "" if __name__ == "__main__": file_path=r'C:\Users\Administrator\Desktop\new招标文件\货物标\HBDL-2024-0310-001-招标文件.pdf' output_folder=r'C:\Users\Administrator\Desktop\new招标文件\货物标\tmp' res=get_invalid_file(file_path,output_folder,"")