zbparse/flask_app/general/读取文件/按页读取pdf.py

207 lines
7.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from PyPDF2 import PdfReader
from flask_app.general.读取文件.clean_pdf import extract_common_header, clean_page_content, create_get_text_function
import fitz # PyMuPDF
def extract_text_by_page_fitz(file_path):
    """
    Extract the text of every page of a PDF using PyMuPDF (fitz).

    Each page whose extracted text is non-empty is passed to
    ``clean_page_content`` together with the value returned by
    ``extract_common_header(file_path)``. The cleaned text is printed,
    followed by a separator line carrying the zero-based page index, and is
    appended to the returned string. Pages with no text only produce a
    notice on stdout.

    :param file_path: path of the PDF file to read
    :return: cleaned text of all non-empty pages, concatenated in page order
             (an empty string when no page yields text)
    """
    common_header = extract_common_header(file_path)
    cleaned_pages = []
    with fitz.open(file_path) as doc:
        for page_num, page in enumerate(doc):
            text = page.get_text("text")  # plain-text extraction
            if not text:
                print(f"Page {page_num + 1} is empty or text could not be extracted.")
                continue
            cleaned_text = clean_page_content(text, common_header)
            print(cleaned_text)
            print("-----------------" + str(page_num))
            # Collect the page text: the accumulation used to be commented
            # out, which made the function always return an empty string.
            cleaned_pages.append(cleaned_text)
    return "".join(cleaned_pages)
def extract_text_by_page(file_path):
    """
    Extract the text of every page of a PDF using PyPDF2.

    Each page whose extracted text is non-empty is passed to
    ``clean_page_content`` together with the value returned by
    ``extract_common_header(file_path)``. The cleaned text is printed,
    followed by a separator line carrying the zero-based page index, and is
    appended to the returned string. Pages with no text only produce a
    notice on stdout.

    :param file_path: path of the PDF file to read
    :return: cleaned text of all non-empty pages, concatenated in page order
             (an empty string when no page yields text)
    """
    common_header = extract_common_header(file_path)
    cleaned_pages = []
    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
        for page_num, page in enumerate(reader.pages):
            text = page.extract_text()
            if not text:
                print(f"Page {page_num + 1} is empty or text could not be extracted.")
                continue
            cleaned_text = clean_page_content(text, common_header)
            print(cleaned_text)
            print("-----------------" + str(page_num))
            # Collect the page text: the accumulation used to be commented
            # out, so callers such as save_extracted_text_to_txt received an
            # empty string and wrote an empty file.
            cleaned_pages.append(cleaned_text)
    return "".join(cleaned_pages)
def process_pages(get_text, common_header, total_pages):
    """
    Scan all pages and report the first and last page index holding text.

    A page counts as non-blank when the raw string returned by
    ``get_text(page_index)`` is not empty after stripping whitespace. Pages
    for which ``get_text`` or ``clean_page_content`` raises are reported on
    stdout and skipped.

    :param get_text: callable taking a zero-based page index and returning
                     that page's text
    :param common_header: header string forwarded to ``clean_page_content``
    :param total_pages: number of pages to scan
    :return: ``(first_index, last_index)`` of the non-blank pages; when no
             page qualifies, ``(0, total_pages - 1)``
    """
    non_blank_pages = []
    for idx in range(total_pages):
        try:
            raw = get_text(idx)
            # The cleaned result is not used for the decision below; the call
            # stays inside the try so a page on which cleaning fails is skipped.
            clean_page_content(raw, common_header)
        except Exception as e:
            print(f"读取第 {idx} 页失败: {e}")
            continue
        # Only pages with something other than whitespace are recorded.
        if raw.strip():
            non_blank_pages.append(idx)

    # Nothing usable found: fall back to the full page range.
    if not non_blank_pages:
        return 0, total_pages - 1
    return non_blank_pages[0], non_blank_pages[-1]
def read_pdf_main(pdf_path):
    """
    Determine the first and last page of a PDF that contain text.

    The file is opened with PyMuPDF (fitz) first. If anything in that
    attempt raises, the failure is printed and the file is read again with
    PyPDF2. Both attempts delegate the page scan to ``process_pages``.

    :param pdf_path: path of the PDF file to inspect
    :return: the ``(start_page, end_page)`` tuple produced by
             ``process_pages``, or ``None`` when both backends fail
    """
    # No common header is extracted here; an empty header is passed on.
    common_header = ""

    # First attempt: PyMuPDF.
    try:
        with fitz.open(pdf_path) as pdf_document:
            total_pages = pdf_document.page_count
            get_text = create_get_text_function('fitz', pdf_document)
            return process_pages(get_text, common_header, total_pages)
    except Exception as e_fitz:
        # The message now names the backend that actually failed (it used to
        # say PyPDF2 here and fitz in the fallback below).
        print(f"read_pdf_main: 使用 fitz 读取 PDF 失败: {e_fitz}")

    # Fallback: PyPDF2, only reached when the fitz attempt raised.
    try:
        with open(pdf_path, "rb") as f:
            pdf_document = PdfReader(f)
            total_pages = len(pdf_document.pages)
            get_text = create_get_text_function('pypdf2', pdf_document)
            return process_pages(get_text, common_header, total_pages)
    except Exception as e_pypdf2:
        print(f"read_pdf_main: 使用 PyPDF2 读取 PDF 失败: {e_pypdf2}")

    # Both backends failed; callers must be prepared for None.
    return None
def save_extracted_text_to_txt(pdf_path, txt_path):
    """
    Extract a PDF's text with ``extract_text_by_page`` and write it to a TXT file.

    Errors are not propagated: a ``FileNotFoundError`` and any other
    exception are each reported on stdout, and the function returns normally.

    :param pdf_path: path of the input PDF file
    :param txt_path: path of the output TXT file (written as UTF-8)
    """
    try:
        # Pull the text out of the PDF first, then persist it.
        content = extract_text_by_page(pdf_path)
        with open(txt_path, mode='w', encoding='utf-8') as sink:
            sink.write(content)
        print(f"文本已成功提取并保存到 {txt_path}")
    except FileNotFoundError:
        print(f"文件未找到: {pdf_path}")
    except Exception as err:
        print(f"发生错误: {err}")
# import fitz # PyMuPDF
# import re
#
#
# def clean_page_content(text, common_header):
# if common_header:
# for header_line in common_header.split('\n'):
# if header_line.strip():
# text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)
#
# text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)
# text = re.sub(r'\s+\d+\s*$', '', text)
# text = re.sub(r'\s*\/\s*\d+\s*', '', text)
#
# return text
#
#
# def extract_common_header(pdf_path):
# doc = fitz.open(pdf_path)
# headers = []
# total_pages = len(doc)
#
# if total_pages == 2:
# pages_to_read = 2
# start_page = 0
# else:
# pages_to_read = 3
# middle_page = total_pages // 2
# start_page = max(0, middle_page - 1)
#
# for i in range(start_page, min(start_page + pages_to_read, total_pages)):
# page = doc[i]
# text = page.get_text()
# if text:
# first_lines = text.strip().split('\n')[:3]
# headers.append(first_lines)
#
# doc.close()
#
# if len(headers) < 2:
# return ""
#
# common_headers = []
# for lines in zip(*headers):
# common_line = set(lines[0].split()).intersection(*[set(line.split()) for line in lines[1:]])
# if common_line:
# common_headers.append(' '.join(common_line))
#
# return '\n'.join(common_headers)
#
#
# def extract_text_by_page(file_path):
# common_header = extract_common_header(file_path)
# result = ""
# doc = fitz.open(file_path)
# num_pages = len(doc)
#
# for page_num in range(num_pages):
# page = doc[page_num]
# text = page.get_text()
# if text:
# cleaned_text = clean_page_content(text, common_header)
# print(cleaned_text)
# result += cleaned_text
# else:
# print(f"Page {page_num + 1} is empty or text could not be extracted.")
#
# doc.close()
# return result
if __name__ == '__main__':
    # Manual run: extract one local sample PDF with the fitz-based extractor,
    # which prints each page's cleaned text as it goes.
    #
    # Other sample inputs that have been used with this script:
    #   'D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
    #   r"C:\Users\Administrator\Desktop\货物标\output2\广水市妇幼招标文件最新W改_evaluation_method.pdf"
    #   'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
    #   "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
    #   "C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
    sample_pdf = r'C:\Users\Administrator\Downloads\完全没有解析进度1_南网超高压公司2024年第四批服务公开招标项目2024-FW-4-S-ZB2 (1).pdf'

    # Optional checks, left disabled:
    #   print(extract_common_header(sample_pdf))
    #   save_extracted_text_to_txt(sample_pdf, "output.txt")
    extracted = extract_text_by_page_fitz(sample_pdf)
    # print(extracted)