"""Extract text from a PDF page by page, stripping repeated headers and page numbers.

NOTE: a PyMuPDF (fitz) backend only needs ``fitz.open(path)`` and
``page.get_text()`` in place of ``PdfReader(path)`` and ``page.extract_text()``;
the cleaning logic below is backend independent.
"""
import re
import sys

try:
    import PyPDF2
    from PyPDF2 import PdfReader
except ImportError:
    # Keep the regex-only helper importable without the PDF backend; the
    # PDF-reading functions raise a clear ImportError when they are called.
    PyPDF2 = None
    PdfReader = None

# How many leading lines of a page are considered header candidates.
_HEADER_LINES = 3
# How many pages (taken around the middle of the document) are compared.
_SAMPLE_PAGES = 3

# "89/129" at the very start of a page. The look-ahead keeps dates such as
# "2020/12/31" from being treated as a page marker.
_FRACTION_AT_START = re.compile(r'^\s*\d+\s*/\s*\d+(?![\d/])\s*')
# "89/129" at the very end of a page, with the same protection for dates.
_FRACTION_AT_END = re.compile(r'\s*(?<![\d/])\d+\s*/\s*\d+\s*$')
# A bare page number at the very start. A number glued to '.', '/', '-' or
# '—' is the head of a section number or date ("3.2", "2020-12-31"), not a
# page number, so it is left alone.
_LEADING_NUMBER = re.compile(r'^\s*\d+\s*(?=[^\d/.\-—])')
# A bare page number at the very end, separated from the text by whitespace.
_TRAILING_NUMBER = re.compile(r'\s+\d+\s*$')
# Markers such as "—2—" or "-2-". The ASCII alphanumeric guards keep dates,
# phone numbers and model numbers ("2020-12-31", "0722-624-8000") intact.
_DASHED_NUMBER = re.compile(
    r'\s*(?<![0-9A-Za-z])[—-]\s*\d+\s*[—-](?![0-9A-Za-z])\s*'
)


def clean_page_content(text, common_header):
    """Remove the shared header and page-number markers from one page of text.

    Args:
        text: Raw text of a single page.
        common_header: Newline-separated header lines shared by the pages, as
            returned by ``extract_common_header``. Falsy values skip header
            removal.

    Returns:
        The page text with these parts removed, in order:

        1. Each non-empty header line, only when the text starts with it
           (one following newline is dropped as well).
        2. An ``N/M`` marker, or otherwise a bare number, at the very start.
        3. An ``N/M`` marker, or otherwise a bare number, at the very end.
        4. Every ``—N—`` / ``-N-`` marker that is not glued to ASCII letters
           or digits.
    """
    if common_header:
        for header_line in common_header.split('\n'):
            header_line = header_line.strip()
            if not header_line:
                continue
            # Equivalent to an anchored, non-multiline regex: the header line
            # is removed only when it is the current start of the text.
            if text.startswith(header_line):
                text = text[len(header_line):]
                if text.startswith('\n'):
                    text = text[1:]

    # Strip at most one marker per edge so that a number belonging to the
    # body (right after/before an "N/M" marker) is never removed as well.
    text, removed = _FRACTION_AT_START.subn('', text, count=1)
    if not removed:
        text = _LEADING_NUMBER.sub('', text, count=1)

    text, removed = _FRACTION_AT_END.subn('', text, count=1)
    if not removed:
        text = _TRAILING_NUMBER.sub('', text, count=1)

    return _DASHED_NUMBER.sub('', text)


def _require_pdf_reader():
    """Return the ``PdfReader`` class or raise ImportError if PyPDF2 is missing."""
    if PdfReader is None:
        raise ImportError("PyPDF2 is required to read PDF files")
    return PdfReader


def _sample_page_indices(total_pages):
    """Return the indices of up to ``_SAMPLE_PAGES`` pages around the middle."""
    start = max(0, total_pages // 2 - 1)
    return range(start, min(start + _SAMPLE_PAGES, total_pages))


def _common_header_from_texts(page_texts):
    """Compute the header shared by the given page texts.

    The first ``_HEADER_LINES`` lines of every non-empty page are compared
    line by line. For each line position, the whitespace-separated words of
    the first page that also occur in the same line of every other page are
    kept, in the first page's order. Returns "" when fewer than two pages
    have text.
    """
    headers = [
        text.strip().split('\n')[:_HEADER_LINES]
        for text in page_texts
        if text
    ]
    if len(headers) < 2:
        return ""  # not enough pages to compare

    common_lines = []
    for lines in zip(*headers):
        # Build the word sets once per line instead of re-splitting per word.
        other_word_sets = [set(line.split()) for line in lines[1:]]
        shared_words = [
            word
            for word in lines[0].split()
            if all(word in words for words in other_word_sets)
        ]
        if shared_words:
            common_lines.append(' '.join(shared_words))
    return '\n'.join(common_lines)


def extract_common_header(pdf_path):
    """Return the header lines shared by pages around the middle of a PDF.

    Args:
        pdf_path: Passed straight to ``PyPDF2.PdfReader``.

    Returns:
        Newline-joined common header lines, or "" when fewer than two of the
        sampled pages contain text.

    Raises:
        ImportError: If PyPDF2 is not installed.
    """
    pages = _require_pdf_reader()(pdf_path).pages
    sampled_texts = [
        pages[index].extract_text() or ""
        for index in _sample_page_indices(len(pages))
    ]
    return _common_header_from_texts(sampled_texts)


def extract_text_by_page(file_path):
    """Extract, clean and concatenate the text of every page of a PDF.

    The cleaned text of each page (or a notice for pages without text) is
    printed as a side effect.

    Args:
        file_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        The cleaned page texts concatenated without any separator.

    Raises:
        ImportError: If PyPDF2 is not installed.
    """
    reader_class = _require_pdf_reader()
    # Parse the document once; the header sample reuses the same page texts.
    with open(file_path, 'rb') as pdf_file:
        page_texts = [
            page.extract_text() or ""
            for page in reader_class(pdf_file).pages
        ]

    common_header = _common_header_from_texts(
        [page_texts[index] for index in _sample_page_indices(len(page_texts))]
    )

    cleaned_pages = []
    for page_number, text in enumerate(page_texts, start=1):
        if text:
            cleaned_text = clean_page_content(text, common_header)
            print(cleaned_text)
            cleaned_pages.append(cleaned_text)
        else:
            print(f"Page {page_number} is empty or text could not be extracted.")
    return ''.join(cleaned_pages)


if __name__ == '__main__':
    # The sample document is only a default; pass another PDF path as the
    # first command-line argument to process a different file.
    default_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path
    res = extract_text_by_page(file_path)