208 lines
7.4 KiB
Python
208 lines
7.4 KiB
Python
from pypdf import PdfReader
|
||
|
||
from flask_app.general.读取文件.clean_pdf import extract_common_header, clean_page_content, create_get_text_function
|
||
import fitz # PyMuPDF
|
||
|
||
def extract_text_by_page_fitz(file_path):
    """
    Extract the text of every page of a PDF with PyMuPDF and return it as one string.

    Each page's raw text is passed through ``clean_page_content`` together with
    the header returned by ``extract_common_header``. The cleaned text of each
    page is printed, followed by a separator line carrying the zero-based page
    index, and is also collected into the returned string.

    :param file_path: path of the PDF file to read
    :return: the cleaned text of all pages that yielded text, concatenated in
             page order; an empty string when no page yielded text
    """
    common_header = extract_common_header(file_path)
    cleaned_pages = []

    with fitz.open(file_path) as doc:
        for page_num, page in enumerate(doc):
            text = page.get_text("text")  # plain-text extraction
            if not text:
                print(f"Page {page_num + 1} is empty or text could not be extracted.")
                continue

            cleaned_text = clean_page_content(text, common_header)
            print(cleaned_text)
            print("-----------------" + str(page_num))
            # Keep the page text for the caller. Previously nothing was
            # accumulated, so the function always returned an empty string.
            cleaned_pages.append(cleaned_text)

    return "".join(cleaned_pages)
|
||
|
||
def extract_text_by_page(file_path):
    """
    Extract the text of every page of a PDF with pypdf and return it as one string.

    Each page's raw text is passed through ``clean_page_content`` with an empty
    common header (header detection is disabled in this function). The cleaned
    text of each page is printed, followed by a separator line carrying the
    zero-based page index, and is also collected into the returned string.

    :param file_path: path of the PDF file to read
    :return: the cleaned text of all pages that yielded text, concatenated in
             page order; an empty string when no page yielded text
    """
    common_header = ""
    cleaned_pages = []

    with open(file_path, 'rb') as file:
        reader = PdfReader(file)
        for page_num, page in enumerate(reader.pages):
            text = page.extract_text()
            if not text:
                print(f"Page {page_num + 1} is empty or text could not be extracted.")
                continue

            cleaned_text = clean_page_content(text, common_header)
            print(cleaned_text)
            print("-----------------" + str(page_num))
            # Keep the page text for the caller. Previously nothing was
            # accumulated, so the function always returned an empty string
            # and anything saving the result wrote an empty file.
            cleaned_pages.append(cleaned_text)

    return "".join(cleaned_pages)
|
||
|
||
def process_pages(get_text, common_header, total_pages):
    """
    Scan all pages and return the indices of the first and last page with text.

    A page counts as having text when the string returned by ``get_text`` is
    non-empty after stripping whitespace. Pages for which ``get_text`` or
    ``clean_page_content`` raises are reported and skipped.

    :param get_text: callable taking a zero-based page index and returning that page's text
    :param common_header: header string forwarded to ``clean_page_content``
    :param total_pages: number of pages to scan
    :return: tuple ``(start_page, end_page)``; ``(0, total_pages - 1)`` when no
             page has text
    """
    pages_with_text = []

    for page_index in range(total_pages):
        try:
            page_text = get_text(page_index)
            # The cleaned result is not used; the call only acts as a check
            # that the page can be processed (a failure skips the page).
            clean_page_content(page_text, common_header)
        except Exception as err:
            print(f"读取第 {page_index} 页失败: {err}")
            continue

        # Only whitespace on the page: not counted as content.
        if not page_text.strip():
            continue
        pages_with_text.append(page_index)

    # No page had text: fall back to the full range.
    if not pages_with_text:
        return 0, total_pages - 1

    return pages_with_text[0], pages_with_text[-1]
def read_pdf_main(pdf_path):
    """
    Find the first and last page of a PDF that contain text.

    The file is read with pypdf first; if anything in that attempt raises, the
    error is printed and the same scan is retried with PyMuPDF (fitz). Header
    detection is disabled, so an empty common header is passed on.

    :param pdf_path: path of the PDF file to inspect
    :return: tuple ``(start_page, end_page)`` from ``process_pages``, or
             ``None`` when both readers fail
    """
    common_header = ""

    # First attempt: pypdf.
    try:
        with open(pdf_path, "rb") as stream:
            reader = PdfReader(stream)
            page_count = len(reader.pages)
            page_text_getter = create_get_text_function('pypdf2', reader)
            first_page, last_page = process_pages(page_text_getter, common_header, page_count)
            return first_page, last_page
    except Exception as pypdf_error:
        print(f"extract_pages_generic: 使用 PyPDF2 读取 PDF 失败: {pypdf_error}")

    # Fallback: PyMuPDF, reached only when the pypdf attempt raised.
    try:
        with fitz.open(pdf_path) as document:
            page_count = document.page_count
            page_text_getter = create_get_text_function('fitz', document)
            first_page, last_page = process_pages(page_text_getter, common_header, page_count)
            return first_page, last_page
    except Exception as fitz_error:
        print(f"extract_pages_generic: 使用 fitz 读取 PDF 失败: {fitz_error}")

    return None
|
||
def save_extracted_text_to_txt(pdf_path, txt_path):
    """
    Extract a PDF's text with ``extract_text_by_page`` and write it to a TXT file.

    Errors are reported on stdout rather than raised.

    :param pdf_path: path of the input PDF file
    :param txt_path: path of the output TXT file (written as UTF-8)
    """
    try:
        pdf_text = extract_text_by_page(pdf_path)

        with open(txt_path, 'w', encoding='utf-8') as output:
            output.write(pdf_text)

        print(f"文本已成功提取并保存到 {txt_path}")
    except FileNotFoundError:
        print(f"文件未找到: {pdf_path}")
    except Exception as err:
        print(f"发生错误: {err}")
|
||
# import fitz # PyMuPDF
|
||
# import re
|
||
#
|
||
#
|
||
# def clean_page_content(text, common_header):
|
||
# if common_header:
|
||
# for header_line in common_header.split('\n'):
|
||
# if header_line.strip():
|
||
# text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)
|
||
#
|
||
# text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)
|
||
# text = re.sub(r'\s+\d+\s*$', '', text)
|
||
# text = re.sub(r'\s*\/\s*\d+\s*', '', text)
|
||
#
|
||
# return text
|
||
#
|
||
#
|
||
# def extract_common_header(pdf_path):
|
||
# doc = fitz.open(pdf_path)
|
||
# headers = []
|
||
# total_pages = len(doc)
|
||
#
|
||
# if total_pages == 2:
|
||
# pages_to_read = 2
|
||
# start_page = 0
|
||
# else:
|
||
# pages_to_read = 3
|
||
# middle_page = total_pages // 2
|
||
# start_page = max(0, middle_page - 1)
|
||
#
|
||
# for i in range(start_page, min(start_page + pages_to_read, total_pages)):
|
||
# page = doc[i]
|
||
# text = page.get_text()
|
||
# if text:
|
||
# first_lines = text.strip().split('\n')[:3]
|
||
# headers.append(first_lines)
|
||
#
|
||
# doc.close()
|
||
#
|
||
# if len(headers) < 2:
|
||
# return ""
|
||
#
|
||
# common_headers = []
|
||
# for lines in zip(*headers):
|
||
# common_line = set(lines[0].split()).intersection(*[set(line.split()) for line in lines[1:]])
|
||
# if common_line:
|
||
# common_headers.append(' '.join(common_line))
|
||
#
|
||
# return '\n'.join(common_headers)
|
||
#
|
||
#
|
||
# def extract_text_by_page(file_path):
|
||
# common_header = extract_common_header(file_path)
|
||
# result = ""
|
||
# doc = fitz.open(file_path)
|
||
# num_pages = len(doc)
|
||
#
|
||
# for page_num in range(num_pages):
|
||
# page = doc[page_num]
|
||
# text = page.get_text()
|
||
# if text:
|
||
# cleaned_text = clean_page_content(text, common_header)
|
||
# print(cleaned_text)
|
||
# result += cleaned_text
|
||
# else:
|
||
# print(f"Page {page_num + 1} is empty or text could not be extracted.")
|
||
#
|
||
# doc.close()
|
||
# return result
|
||
|
||
|
||
if __name__ == '__main__':
    # Manual run: print the cleaned text of a local sample PDF page by page.
    # Alternative sample inputs and earlier experiments are kept below for reference.
    # file_path='D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
    # pdf_path=r"C:\Users\Administrator\Desktop\货物标\output2\广水市妇幼招标文件最新(W改)_evaluation_method.pdf"
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
    # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
    # file_path="C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
    pdf_path = r'C:\Users\Administrator\Downloads\(完全没有解析进度)1_南网超高压公司2024年第四批服务公开招标项目(2024-FW-4-S-ZB2) (1).pdf'

    # ress = extract_common_header(pdf_path)
    # print(ress)
    # print("-----------------")
    res = extract_text_by_page_fitz(pdf_path)
    # print(res)磋商文件_tobidders_notice_part2.pdf
    # save_extracted_text_to_txt(file_path,"output.txt")