"""Extract the "bid / response document format" chapter from tender PDFs.

The module scans a PDF page by page, strips the repeated page header and
page-number artifacts from the extracted text, locates the first page that
matches a "begin" regex and the next page that matches an "end" regex, and
writes that page range to a new PDF in the output folder.
"""

import os
import re

from PyPDF2 import PdfReader, PdfWriter

# NOTE: Word -> PDF conversion is currently disabled; the helper below is kept
# for reference until that module is available again.
# from flask_app.general.format_change import docx2pdf

# Page-number artifacts removed from every page, in one pass:
#   1. digits at the start of a line that are followed by a non-digit,
#   2. whitespace + digits at the end of a line,
#   3. "/ N" fragments,
#   4. "- N -" / "— N —" style markers.
_PAGE_NUMBER_RE = re.compile(
    r'(^\s*\d+\s*(?=\D))|(\s+\d+\s*$)|(\s*/\s*\d+\s*)|(\s*[—-]\s*\d+\s*[—-]\s*)',
    flags=re.MULTILINE,
)


def clean_page_content(text, common_header):
    """Strip the shared page header and page-number artifacts from page text.

    Args:
        text: Raw text extracted from one page.
        common_header: Newline-separated header lines, as produced by
            ``extract_common_header``. May be empty.

    Returns:
        The cleaned page text.
    """
    if common_header:
        for header_line in common_header.split('\n'):
            stripped = header_line.strip()
            if not stripped:
                continue
            # No MULTILINE flag here: '^' anchors at the very start of the
            # remaining text, so header lines are peeled off one by one from
            # the top of the page in the order they are listed.
            text = re.sub(r'^' + re.escape(stripped) + r'\n?', '', text, count=1)

    return _PAGE_NUMBER_RE.sub('', text)


def _ordered_common_lines(page_heads):
    """Return the lines present in every entry of *page_heads*, ordered as in
    the first entry and without duplicates."""
    if not page_heads:
        return []
    shared = set(page_heads[0]).intersection(*(set(head) for head in page_heads[1:]))
    ordered = []
    for line in page_heads[0]:
        if line in shared and line not in ordered:
            ordered.append(line)
    return ordered


def extract_common_header(pdf_path):
    """Detect the header lines repeated at the top of the PDF's pages.

    For a two-page document both pages are sampled; otherwise up to three
    pages are sampled starting at index ``total_pages // 2 - 1``. Only the
    first three lines of each sampled page's stripped text are considered,
    and pages with no extractable text are skipped.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        The lines shared by all sampled pages, joined with ``'\\n'`` in the
        order they appear on the first sampled page, or ``""`` when fewer
        than two sampled pages yielded text.
    """
    pdf_document = PdfReader(pdf_path)
    total_pages = len(pdf_document.pages)

    # Decide how many pages to sample and where to start.
    if total_pages == 2:
        pages_to_read, start_page = 2, 0
    else:
        pages_to_read, start_page = 3, max(0, (total_pages // 2) - 1)

    page_heads = []
    for index in range(start_page, min(start_page + pages_to_read, total_pages)):
        text = pdf_document.pages[index].extract_text() or ""
        if not text:
            continue
        # Only the first three lines of each page are header candidates.
        first_lines = text.strip().split('\n')[:3]
        page_heads.append([line.strip() for line in first_lines if line.strip()])

    if len(page_heads) < 2:
        return ""  # Not enough pages to compare.

    # Order matters: clean_page_content removes header lines from the top of
    # the page in sequence, so an unordered set would leave lines behind.
    return '\n'.join(_ordered_common_lines(page_heads))


def is_pdf_or_doc(filename):
    """Return True when *filename* ends with .pdf, .doc or .docx (any case)."""
    return filename.lower().endswith(('.pdf', '.doc', '.docx'))


# def convert_to_pdf(file_path):
#     """
#     Convert a Word document to PDF; return other paths unchanged.
#     """
#     if file_path.lower().endswith(('.doc', '.docx')):
#         return docx2pdf(file_path)
#     return file_path


def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):
    """Write pages ``start_page..end_page`` (inclusive) to a new PDF file.

    The output file is named ``<input base name>_<output_suffix>.pdf`` and is
    placed in *output_folder*.

    Args:
        pdf_document: An open ``PdfReader`` for the source PDF.
        start_page: Zero-based index of the first page to copy.
        end_page: Zero-based index of the last page to copy.
        pdf_path: Path of the source PDF (used for naming and messages).
        output_folder: Directory that receives the output file.
        output_suffix: Suffix appended to the output file name.

    Returns:
        The output file path, or ``""`` when the range is invalid or any
        error occurs while writing (the error is printed, not raised).
    """
    try:
        base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")

        if start_page is None or end_page is None or start_page > end_page:
            print(f"无效的页面范围: {start_page} 到 {end_page} 文件: {pdf_path}")
            return ""

        output_doc = PdfWriter()
        for page_num in range(start_page, end_page + 1):
            output_doc.add_page(pdf_document.pages[page_num])

        with open(output_pdf_path, 'wb') as f:
            output_doc.write(f)

        print(f"{output_suffix} 已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
        return output_pdf_path
    except Exception as e:
        # Best-effort by design: callers treat "" as "nothing generated".
        print(f"Error in save_extracted_pages for file {pdf_path}: {e}")
        return ""


def _scan_page_range(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
    """Scan pages after *begin_page* and return ``(start, end)`` indices.

    Either value is ``None`` when the corresponding pattern was not found.
    The page that matches the begin pattern is never tested against the end
    pattern.
    """
    start_page = None
    for index, page in enumerate(pdf_document.pages):
        if index <= begin_page:
            continue

        cleaned_text = clean_page_content(page.extract_text() or "", common_header)

        if start_page is None:
            if re.search(begin_pattern, cleaned_text):
                start_page = index
            continue

        if re.search(end_pattern, cleaned_text):
            return start_page, index

    return start_page, None


def find_page_indices(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
    """Find the start and end page indices of the section to extract.

    Args:
        pdf_document: An open ``PdfReader``.
        begin_pattern: Regex (string or compiled) marking the first page.
        end_pattern: Regex (string or compiled) marking the last page.
        begin_page: Pages with index ``<= begin_page`` are skipped.
        common_header: Header text passed on to ``clean_page_content``.

    Returns:
        ``(start_page, end_page)``. ``start_page`` is ``None`` when the begin
        pattern never matches; ``end_page`` falls back to the last page index
        when the end pattern never matches.
    """
    start_page, end_page = _scan_page_range(
        pdf_document, begin_pattern, end_pattern, begin_page, common_header
    )
    if end_page is None:
        end_page = len(pdf_document.pages) - 1
    return start_page, end_page


def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """Extract the page range delimited by the begin/end patterns from a PDF.

    When *output_suffix* is ``"format"`` and the begin pattern is not found,
    a second scan is made with a looser begin pattern that does not require
    a chapter-number prefix.

    Args:
        pdf_path: Path of the source PDF.
        output_folder: Directory that receives the output file.
        begin_pattern: Regex marking the first page of the section.
        begin_page: Pages with index ``<= begin_page`` are skipped.
        end_pattern: Regex marking the last page of the section.
        output_suffix: Suffix used in the output file name and messages.

    Returns:
        The generated file path, or ``""`` on failure (errors are printed).
    """
    try:
        common_header = extract_common_header(pdf_path)
        pdf_document = PdfReader(pdf_path)

        # The raw scan is used (not find_page_indices) so that a missing end
        # page can be reported below instead of being silently defaulted.
        start_page, end_page = _scan_page_range(
            pdf_document, begin_pattern, end_pattern, begin_page, common_header
        )

        if output_suffix == "format" and start_page is None:
            print(f"{output_suffix}: 未找到起始页,尝试二次提取! 文件: {pdf_path}")
            # Second attempt with a looser begin pattern.
            start_page, end_page = _scan_page_range(
                pdf_document,
                re.compile(r'^(?:响应|投标).*?格式.*', re.MULTILINE),
                end_pattern,
                begin_page,
                common_header
            )
            if start_page is None:
                print(f"{output_suffix}: 未找到起始页,提取失败! 文件: {pdf_path}")
                return ""

        if start_page is None:
            print(f"未找到起始页,提取失败! 文件: {pdf_path}")
            return ""

        if end_page is None:
            end_page = len(pdf_document.pages) - 1
            print(f"{output_suffix}: 未找到结束页,默认截取到文件末尾。 文件: {pdf_path}")

        return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
    except Exception as e:
        # Best-effort by design: one unreadable file must not abort a batch.
        print(f"Error processing {pdf_path}: {e}")
        return ""


def process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """Process a single file and return the generated path, or ``""``.

    Word -> PDF conversion is currently disabled, so *file_path* is handed
    to ``extract_pages`` unchanged.
    """
    # pdf_path = convert_to_pdf(file_path)
    pdf_path = file_path
    return extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) or ""


def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """Process a single file or every PDF/Word file directly inside a folder.

    Args:
        input_path: A file path or a directory path.
        output_folder: Directory for generated files; created if missing.
        begin_pattern: Regex marking the first page of the section.
        begin_page: Pages with index ``<= begin_page`` are skipped.
        end_pattern: Regex marking the last page of the section.
        output_suffix: Suffix used in output file names.

    Returns:
        A list of the successfully generated file paths.
    """
    os.makedirs(output_folder, exist_ok=True)

    generated_files = []
    if os.path.isdir(input_path):
        for file_name in os.listdir(input_path):
            file_path = os.path.join(input_path, file_name)
            if is_pdf_or_doc(file_path):
                result = process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
                generated_files.append(result)
    elif os.path.isfile(input_path) and is_pdf_or_doc(input_path):
        result = process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
        generated_files.append(result)
    else:
        print("提供的路径既不是文件夹也不是PDF/Word文件。")

    # Failed extractions are reported as "", so drop them.
    return [path for path in generated_files if path]


def truncate_pdf_main(input_path, output_folder, selection):
    """Entry point: extract a section of the input PDF(s) chosen by *selection*.

    Args:
        input_path: A file path or a directory path.
        output_folder: Directory for generated files.
        selection: ``1`` selects the bid/response document format chapter;
            any other value is rejected.

    Returns:
        A list of generated file paths; empty on invalid selection or error.
    """
    try:
        if selection == 1:
            # Bid / response document format chapter.
            begin_page = 10
            begin_pattern = re.compile(
                r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*',
                re.MULTILINE
            )
            end_pattern = re.compile(
                r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
                re.MULTILINE
            )
            local_output_suffix = "format"
        else:
            print("无效的选择:请选择1")
            return []

        return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, local_output_suffix)
    except Exception as e:
        print(f"Error in truncate_pdf_main: {e}")
        return []


if __name__ == "__main__":
    # Input and output paths.
    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\包头市公安支队机动车查验监管系统招标文201907.pdf"
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
    selection = 1  # 1 - bid/response document format
    # Run the extraction.
    generated_files = truncate_pdf_main(input_path, output_folder, selection)
    print("生成的文件:", generated_files)