from typing import Callable, Optional, Tuple

import fitz  # PyMuPDF
from PyPDF2 import PdfReader

from flask_app.general.读取文件.clean_pdf import (
    clean_page_content,
    create_get_text_function,
    extract_common_header,
)


def extract_text_by_page_fitz(file_path: str) -> str:
    """Extract the cleaned text of every page of a PDF using PyMuPDF.

    Each page's text is passed through ``clean_page_content`` together with
    the header returned by ``extract_common_header``. The cleaned text of each
    page is echoed to stdout (followed by a separator carrying the zero-based
    page index) and also collected into the return value.

    :param file_path: Path of the PDF file to read.
    :return: The cleaned text of all non-empty pages, concatenated in order.
    """
    common_header = extract_common_header(file_path)
    cleaned_pages = []
    with fitz.open(file_path) as doc:
        for page_num in range(len(doc)):
            text = doc[page_num].get_text("text")
            if not text:
                print(f"Page {page_num + 1} is empty or text could not be extracted.")
                continue
            cleaned_text = clean_page_content(text, common_header)
            print(cleaned_text)
            print(f"-----------------{page_num}")
            # Previously the accumulation was commented out, so the function
            # always returned an empty string.
            cleaned_pages.append(cleaned_text)
    return "".join(cleaned_pages)


def extract_text_by_page(file_path: str) -> str:
    """Extract the cleaned text of every page of a PDF using PyPDF2.

    Each page's text is passed through ``clean_page_content`` together with
    the header returned by ``extract_common_header``. The cleaned text of each
    page is echoed to stdout (followed by a separator carrying the zero-based
    page index) and also collected into the return value.

    :param file_path: Path of the PDF file to read.
    :return: The cleaned text of all non-empty pages, concatenated in order.
    """
    common_header = extract_common_header(file_path)
    cleaned_pages = []
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
        for page_num, page in enumerate(reader.pages):
            text = page.extract_text()
            if not text:
                print(f"Page {page_num + 1} is empty or text could not be extracted.")
                continue
            cleaned_text = clean_page_content(text, common_header)
            print(cleaned_text)
            print(f"-----------------{page_num}")
            # Previously the accumulation was commented out, so the function
            # always returned an empty string (and callers wrote empty files).
            cleaned_pages.append(cleaned_text)
    return "".join(cleaned_pages)


def process_pages(
    get_text: Callable[[int], Optional[str]],
    common_header: str,
    total_pages: int,
) -> Tuple[int, int]:
    """Find the first and last page indices that contain non-blank text.

    A page counts as non-blank when its raw text is non-empty after stripping
    whitespace. Pages for which reading or cleaning raises are reported on
    stdout and skipped.

    :param get_text: Callable returning the text of the page at a zero-based index.
    :param common_header: Header string forwarded to ``clean_page_content``.
    :param total_pages: Number of pages to scan.
    :return: ``(start_page, end_page)``. If no page has text, the full range
        ``(0, total_pages - 1)`` is returned (``(0, -1)`` when ``total_pages`` is 0).
    """
    start_page = None
    end_page = None

    for page_num in range(total_pages):
        try:
            text = get_text(page_num)
            # The cleaned result is not needed here; the call is kept so that a
            # page on which cleaning fails is skipped, exactly as before.
            clean_page_content(text, common_header)
        except Exception as e:
            print(f"读取第 {page_num} 页失败: {e}")
            continue

        # Guard against a None result so one odd page cannot abort the scan
        # with an AttributeError raised outside the try block.
        if text is not None and text.strip():
            if start_page is None:
                start_page = page_num
            end_page = page_num

    # No page contained text: fall back to the whole document.
    if start_page is None:
        start_page = 0
        end_page = total_pages - 1

    return start_page, end_page


def read_pdf_main(pdf_path: str) -> Optional[Tuple[int, int]]:
    """Return the first and last text-bearing page indices of a PDF.

    The file is read with PyPDF2 first; if that raises, PyMuPDF (fitz) is
    tried as a fallback. Failures are reported on stdout.

    :param pdf_path: Path of the PDF file to inspect.
    :return: ``(start_page, end_page)`` as produced by ``process_pages``, or
        ``None`` when both libraries fail to read the file.
    """
    # Header stripping is intentionally disabled here (empty header).
    common_header = ""
    try:
        with open(pdf_path, "rb") as f:
            pdf_document = PdfReader(f)
            total_pages = len(pdf_document.pages)
            get_text = create_get_text_function('pypdf2', pdf_document)
            return process_pages(get_text, common_header, total_pages)
    except Exception as e_pypdf2:
        print(f"extract_pages_generic: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")

    # PyPDF2 failed: retry with PyMuPDF.
    try:
        with fitz.open(pdf_path) as pdf_document:
            total_pages = pdf_document.page_count
            get_text = create_get_text_function('fitz', pdf_document)
            return process_pages(get_text, common_header, total_pages)
    except Exception as e_fitz:
        print(f"extract_pages_generic: 使用 fitz 读取 PDF 失败: {e_fitz}")

    # Both readers failed; callers must handle the None result.
    return None


def save_extracted_text_to_txt(pdf_path: str, txt_path: str) -> None:
    """Extract a PDF's text with ``extract_text_by_page`` and save it as UTF-8 text.

    Errors are reported on stdout rather than raised.

    :param pdf_path: Path of the input PDF file.
    :param txt_path: Path of the output TXT file.
    """
    try:
        extracted_text = extract_text_by_page(pdf_path)

        with open(txt_path, 'w', encoding='utf-8') as txt_file:
            txt_file.write(extracted_text)

        print(f"文本已成功提取并保存到 {txt_path}")

    except FileNotFoundError:
        print(f"文件未找到: {pdf_path}")
    except Exception as e:
        print(f"发生错误: {e}")


if __name__ == '__main__':
    # Manual smoke test against a local sample file.
    pdf_path = r'C:\Users\Administrator\Downloads\(完全没有解析进度)1_南网超高压公司2024年第四批服务公开招标项目(2024-FW-4-S-ZB2) (1).pdf'
    res = extract_text_by_page_fitz(pdf_path)