zbparse/flask_app/general/读取文件/按页读取pdf.py

155 lines
5.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import PyPDF2
from flask_app.general.读取文件.clean_pdf import extract_common_header, clean_page_content
import fitz # PyMuPDF
def extract_text_by_page_fitz(file_path):
    """Extract the text of every page of a PDF using PyMuPDF (fitz).

    The header obtained from ``extract_common_header(file_path)`` is passed,
    together with each page's raw text, to ``clean_page_content``. The cleaned
    text of every non-empty page is printed (followed by a separator line that
    carries the 0-based page index) and collected into the return value.

    :param file_path: path of the PDF file to read
    :return: the cleaned text of all non-empty pages, concatenated in page
        order with no extra separator; ``""`` when no page yields text
    """
    common_header = extract_common_header(file_path)
    page_texts = []
    with fitz.open(file_path) as doc:
        for page_num, page in enumerate(doc):
            text = page.get_text("text")  # plain-text extraction mode
            if not text:
                print(f"Page {page_num + 1} is empty or text could not be extracted.")
                continue
            cleaned_text = clean_page_content(text, common_header)
            print(cleaned_text)
            print("-----------------" + str(page_num))
            # Previously the accumulation was commented out, so the function
            # always returned an empty string; keep the page text for callers.
            page_texts.append(cleaned_text)
    return "".join(page_texts)
def extract_text_by_page(file_path):
    """Extract the text of every page of a PDF using PyPDF2.

    The header obtained from ``extract_common_header(file_path)`` is passed,
    together with each page's raw text, to ``clean_page_content``. The cleaned
    text of every non-empty page is printed (followed by a separator line that
    carries the 0-based page index) and collected into the return value.

    :param file_path: path of the PDF file to read
    :return: the cleaned text of all non-empty pages, concatenated in page
        order with no extra separator; ``""`` when no page yields text
    """
    common_header = extract_common_header(file_path)
    page_texts = []
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page_num, page in enumerate(reader.pages):
            text = page.extract_text()
            if not text:
                print(f"Page {page_num + 1} is empty or text could not be extracted.")
                continue
            cleaned_text = clean_page_content(text, common_header)
            print(cleaned_text)
            print("-----------------" + str(page_num))
            # Previously the accumulation was commented out, so the function
            # always returned an empty string; keep the page text for callers.
            page_texts.append(cleaned_text)
    return "".join(page_texts)
def save_extracted_text_to_txt(pdf_path, txt_path):
    """Write the text returned by ``extract_text_by_page`` to a TXT file.

    Failures are reported on stdout instead of being raised: a
    ``FileNotFoundError`` prints a "file not found" message naming
    ``pdf_path``; any other exception prints a generic error message.

    :param pdf_path: path of the input PDF file
    :param txt_path: path of the output TXT file (written as UTF-8)
    """
    try:
        # Extract before opening the output so that a failed extraction never
        # creates or truncates the target file.
        pdf_text = extract_text_by_page(pdf_path)
        with open(txt_path, 'w', encoding='utf-8') as out_file:
            out_file.write(pdf_text)
        print(f"文本已成功提取并保存到 {txt_path}")
    except Exception as err:
        if isinstance(err, FileNotFoundError):
            print(f"文件未找到: {pdf_path}")
        else:
            print(f"发生错误: {err}")
# import fitz # PyMuPDF
# import re
#
#
# def clean_page_content(text, common_header):
# if common_header:
# for header_line in common_header.split('\n'):
# if header_line.strip():
# text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)
#
# text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)
# text = re.sub(r'\s+\d+\s*$', '', text)
# text = re.sub(r'\s*\/\s*\d+\s*', '', text)
#
# return text
#
#
# def extract_common_header(pdf_path):
# doc = fitz.open(pdf_path)
# headers = []
# total_pages = len(doc)
#
# if total_pages == 2:
# pages_to_read = 2
# start_page = 0
# else:
# pages_to_read = 3
# middle_page = total_pages // 2
# start_page = max(0, middle_page - 1)
#
# for i in range(start_page, min(start_page + pages_to_read, total_pages)):
# page = doc[i]
# text = page.get_text()
# if text:
# first_lines = text.strip().split('\n')[:3]
# headers.append(first_lines)
#
# doc.close()
#
# if len(headers) < 2:
# return ""
#
# common_headers = []
# for lines in zip(*headers):
# common_line = set(lines[0].split()).intersection(*[set(line.split()) for line in lines[1:]])
# if common_line:
# common_headers.append(' '.join(common_line))
#
# return '\n'.join(common_headers)
#
#
# def extract_text_by_page(file_path):
# common_header = extract_common_header(file_path)
# result = ""
# doc = fitz.open(file_path)
# num_pages = len(doc)
#
# for page_num in range(num_pages):
# page = doc[page_num]
# text = page.get_text()
# if text:
# cleaned_text = clean_page_content(text, common_header)
# print(cleaned_text)
# result += cleaned_text
# else:
# print(f"Page {page_num + 1} is empty or text could not be extracted.")
#
# doc.close()
# return result
if __name__ == '__main__':
    # Manual smoke test: run the PyMuPDF-based extractor on a local sample PDF.
    # The function prints each page's cleaned text as it goes.
    pdf_path = r'C:\Users\Administrator\Downloads\完全没有解析进度1_南网超高压公司2024年第四批服务公开招标项目2024-FW-4-S-ZB2 (1).pdf'

    # Other sample documents used during development:
    # file_path = 'D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
    # pdf_path = r"C:\Users\Administrator\Desktop\货物标\output2\广水市妇幼招标文件最新W改_evaluation_method.pdf"
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
    # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
    # file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"

    # Alternative manual checks:
    # print(extract_common_header(pdf_path))
    # save_extracted_text_to_txt(file_path, "output.txt")

    extracted_text = extract_text_by_page_fitz(pdf_path)
    # print(extracted_text)