131 lines
4.6 KiB
Python
Raw Permalink Normal View History

2024-09-19 18:00:24 +08:00
import PyPDF2
2024-11-01 14:28:10 +08:00
from flask_app.general.clean_pdf import extract_common_header, clean_page_content
2024-10-18 15:44:18 +08:00
2024-09-19 18:00:24 +08:00
def extract_text_by_page(file_path):
    """
    Extract the text of every page of a PDF and return it as one string.

    Each page's raw text is passed through ``clean_page_content`` together
    with the value returned by ``extract_common_header`` for the same file.
    The cleaned text of each page is also echoed to stdout, followed by a
    separator line that carries the zero-based page index.

    :param file_path: path of the PDF file to read
    :return: the cleaned text of all pages concatenated in page order, with
        no separator inserted between pages; pages that yield no text are
        skipped (a notice with the one-based page number is printed instead)
    """
    common_header = extract_common_header(file_path)
    cleaned_pages = []
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page_num, page in enumerate(reader.pages):
            text = page.extract_text()
            if not text:
                print(f"Page {page_num + 1} is empty or text could not be extracted.")
                continue
            cleaned_text = clean_page_content(text, common_header)
            print(cleaned_text)
            print("-----------------" + str(page_num))
            cleaned_pages.append(cleaned_text)
    # Join once at the end instead of growing a string page by page.
    return "".join(cleaned_pages)
2024-10-09 13:50:28 +08:00
def save_extracted_text_to_txt(pdf_path, txt_path):
    """
    Extract the text of a PDF with ``extract_text_by_page`` and save it to a TXT file.

    Failures are reported on stdout instead of being raised, so the function
    returns ``None`` whether or not the file was written.

    :param pdf_path: path of the input PDF file
    :param txt_path: path of the output TXT file (opened in 'w' mode, UTF-8)
    """
    try:
        # Extract the text content.
        extracted_text = extract_text_by_page(pdf_path)
        # Write the extracted text to the TXT file.
        with open(txt_path, 'w', encoding='utf-8') as txt_file:
            txt_file.write(extracted_text)
        print(f"文本已成功提取并保存到 {txt_path}")
    except FileNotFoundError as e:
        # The missing path is not necessarily the PDF: opening txt_path inside
        # a non-existent directory raises the same exception. Report the path
        # the exception carries, falling back to pdf_path when it has none.
        print(f"文件未找到: {e.filename or pdf_path}")
    except Exception as e:
        print(f"发生错误: {e}")
2024-10-09 13:50:28 +08:00
# import fitz # PyMuPDF
# import re
#
#
# def clean_page_content(text, common_header):
# if common_header:
# for header_line in common_header.split('\n'):
# if header_line.strip():
# text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)
#
# text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)
# text = re.sub(r'\s+\d+\s*$', '', text)
# text = re.sub(r'\s*\/\s*\d+\s*', '', text)
#
# return text
#
#
# def extract_common_header(pdf_path):
# doc = fitz.open(pdf_path)
# headers = []
# total_pages = len(doc)
#
# if total_pages == 2:
# pages_to_read = 2
# start_page = 0
# else:
# pages_to_read = 3
# middle_page = total_pages // 2
# start_page = max(0, middle_page - 1)
#
# for i in range(start_page, min(start_page + pages_to_read, total_pages)):
# page = doc[i]
# text = page.get_text()
# if text:
# first_lines = text.strip().split('\n')[:3]
# headers.append(first_lines)
#
# doc.close()
#
# if len(headers) < 2:
# return ""
#
# common_headers = []
# for lines in zip(*headers):
# common_line = set(lines[0].split()).intersection(*[set(line.split()) for line in lines[1:]])
# if common_line:
# common_headers.append(' '.join(common_line))
#
# return '\n'.join(common_headers)
#
#
# def extract_text_by_page(file_path):
# common_header = extract_common_header(file_path)
# result = ""
# doc = fitz.open(file_path)
# num_pages = len(doc)
#
# for page_num in range(num_pages):
# page = doc[page_num]
# text = page.get_text()
# if text:
# cleaned_text = clean_page_content(text, common_header)
# print(cleaned_text)
# result += cleaned_text
# else:
# print(f"Page {page_num + 1} is empty or text could not be extracted.")
#
# doc.close()
# return result
2024-09-19 18:00:24 +08:00
if __name__ == '__main__':
    # Manual check: print the detected common header, then run the page-by-page
    # extraction (which prints each cleaned page as it goes).
    #
    # Other sample inputs that have been used with this script:
    # file_path = 'D:\\flask_project\\flask_app\\static\\output\\output1\\648e094b-e677-47ce-9073-09e0c82af210\\ztbfile_tobidders_notice_part2.pdf'
    # file_path = r"C:\Users\Administrator\Desktop\招标文件\招标test文件夹\zbtest8.pdf"
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
    # file_path = "C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest8.pdf"
    # file_path = "C:\\Users\\Administrator\\Desktop\\fsdownload\\45f650ce-e519-457b-9ad6-5840e2ede539\\ztbfile_procurement.pdf"
    file_path = r"D:\flask_project\flask_app\static\output\output1\1571dc96-4bb3-4ab2-a45a-993ed82678e8\ztbfile.pdf"

    common_header = extract_common_header(file_path)
    print(common_header)
    print("-----------------")

    extracted_text = extract_text_by_page(file_path)
    # print(extracted_text)  # sample file: 磋商文件_tobidders_notice_part2.pdf
    # save_extracted_text_to_txt(file_path, "output.txt")