2024-09-19 18:00:24 +08:00
|
|
|
|
import PyPDF2
|
|
|
|
|
import re # 导入正则表达式库
|
|
|
|
|
|
|
|
|
|
from PyPDF2 import PdfReader
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_page_content(text, common_header):
    """Strip the shared page header and page-number markers from one page.

    Args:
        text: Raw text extracted from a single PDF page.
        common_header: Newline-separated header lines shared by the pages
            (as produced by ``extract_common_header``); may be empty.

    Returns:
        ``text`` with the header lines removed from its beginning and with a
        page marker (``89``, ``89/129`` or ``89 / 129``) removed from its very
        start and/or very end.

    Page markers are only recognised at the two ends of the page text.
    The previous implementation deleted every ``/<digits>`` sequence
    anywhere in the text, which corrupted dates such as ``2024/09/19`` and
    fractions such as ``1/2`` inside the body.
    """
    # Remove the shared header first, one line at a time. The pattern is
    # anchored to the start of the text, so each header line is only removed
    # when it is the current first line.
    if common_header:
        for header_line in common_header.split('\n'):
            header_line = header_line.strip()
            if header_line:  # skip blank header lines
                text = re.sub(
                    r'^' + re.escape(header_line) + r'\n?', '', text, count=1
                )

    # Leading page marker: either "N/M" (not followed by another digit or
    # slash, so dates like 2024/09/19 are left alone) or a bare number that
    # is followed by something other than a digit or a slash.
    text = re.sub(
        r'^\s*\d+\s*/\s*\d+(?![\d/])\s*|^\s*\d+\s*(?=[^\d/])',
        '',
        text,
        count=1,
    )

    # Trailing page marker: either "N/M" (not preceded by a digit or slash)
    # or a whitespace-separated bare number at the very end.
    text = re.sub(
        r'\s*(?<![\d/])\d+\s*/\s*\d+\s*$|\s+\d+\s*$',
        '',
        text,
        count=1,
    )

    return text
|
|
|
|
|
def extract_common_header(pdf_path):
    """Return the header text shared by a sample of pages of a PDF.

    Up to three consecutive pages starting one page before the middle of the
    document are sampled (both pages when the document has exactly two).
    For each sampled page with extractable text, the first three lines are
    kept. The lines are then compared position by position: the words of the
    first sampled page's line that also occur in the corresponding line of
    every other sampled page are kept, in their original order.

    Args:
        pdf_path: Path of the PDF file, passed to ``PdfReader``.

    Returns:
        The shared header lines joined with ``'\\n'``, or ``""`` when fewer
        than two sampled pages yielded text.
    """
    from PyPDF2 import PdfReader

    reader = PdfReader(pdf_path)
    page_count = len(reader.pages)

    # Decide where the sample window starts and how many pages it spans.
    if page_count == 2:
        first_index, sample_size = 0, 2
    else:
        first_index, sample_size = max(0, page_count // 2 - 1), 3
    last_index = min(first_index + sample_size, page_count)

    # Collect the top three lines of every sampled page that has text.
    sampled_tops = []
    for index in range(first_index, last_index):
        page_text = reader.pages[index].extract_text() or ""
        if not page_text:
            continue
        sampled_tops.append(page_text.strip().split('\n')[:3])

    # A single page gives nothing to compare against.
    if len(sampled_tops) < 2:
        return ""

    # Compare line 1 with line 1, line 2 with line 2, and so on, keeping the
    # first page's word order for the words every page has in that line.
    shared_lines = []
    for reference, *others in zip(*sampled_tops):
        other_tokens = [line.split() for line in others]
        kept_words = [
            word
            for word in reference.split()
            if all(word in tokens for tokens in other_tokens)
        ]
        if kept_words:
            shared_lines.append(' '.join(kept_words))

    return '\n'.join(shared_lines)
|
|
|
|
|
|
2024-10-18 15:44:18 +08:00
|
|
|
|
|
2024-09-19 18:00:24 +08:00
|
|
|
|
def extract_text_by_page(file_path):
    """Extract, clean and concatenate the text of every page of a PDF.

    The shared header is detected once with ``extract_common_header`` and
    each page's text is passed through ``clean_page_content``. Every cleaned
    page is printed as it is produced; a notice is printed for pages that
    yield no text.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The cleaned text of all pages concatenated in page order, with no
        separator added between pages.
    """
    shared_header = extract_common_header(file_path)
    cleaned_pages = []

    with open(file_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        for page_num in range(len(reader.pages)):
            raw_text = reader.pages[page_num].extract_text()
            if not raw_text:
                print(f"Page {page_num + 1} is empty or text could not be extracted.")
                continue
            page_text = clean_page_content(raw_text, shared_header)
            print(page_text)
            cleaned_pages.append(page_text)

    return "".join(cleaned_pages)
|
2024-10-09 13:50:28 +08:00
|
|
|
|
|
|
|
|
|
# import fitz # PyMuPDF
|
|
|
|
|
# import re
|
|
|
|
|
#
|
|
|
|
|
#
|
|
|
|
|
# def clean_page_content(text, common_header):
|
|
|
|
|
# if common_header:
|
|
|
|
|
# for header_line in common_header.split('\n'):
|
|
|
|
|
# if header_line.strip():
|
|
|
|
|
# text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)
|
|
|
|
|
#
|
|
|
|
|
# text = re.sub(r'^\s*\d+\s*(?=\D)', '', text)
|
|
|
|
|
# text = re.sub(r'\s+\d+\s*$', '', text)
|
|
|
|
|
# text = re.sub(r'\s*\/\s*\d+\s*', '', text)
|
|
|
|
|
#
|
|
|
|
|
# return text
|
|
|
|
|
#
|
|
|
|
|
#
|
|
|
|
|
# def extract_common_header(pdf_path):
|
|
|
|
|
# doc = fitz.open(pdf_path)
|
|
|
|
|
# headers = []
|
|
|
|
|
# total_pages = len(doc)
|
|
|
|
|
#
|
|
|
|
|
# if total_pages == 2:
|
|
|
|
|
# pages_to_read = 2
|
|
|
|
|
# start_page = 0
|
|
|
|
|
# else:
|
|
|
|
|
# pages_to_read = 3
|
|
|
|
|
# middle_page = total_pages // 2
|
|
|
|
|
# start_page = max(0, middle_page - 1)
|
|
|
|
|
#
|
|
|
|
|
# for i in range(start_page, min(start_page + pages_to_read, total_pages)):
|
|
|
|
|
# page = doc[i]
|
|
|
|
|
# text = page.get_text()
|
|
|
|
|
# if text:
|
|
|
|
|
# first_lines = text.strip().split('\n')[:3]
|
|
|
|
|
# headers.append(first_lines)
|
|
|
|
|
#
|
|
|
|
|
# doc.close()
|
|
|
|
|
#
|
|
|
|
|
# if len(headers) < 2:
|
|
|
|
|
# return ""
|
|
|
|
|
#
|
|
|
|
|
# common_headers = []
|
|
|
|
|
# for lines in zip(*headers):
|
|
|
|
|
# common_line = set(lines[0].split()).intersection(*[set(line.split()) for line in lines[1:]])
|
|
|
|
|
# if common_line:
|
|
|
|
|
# common_headers.append(' '.join(common_line))
|
|
|
|
|
#
|
|
|
|
|
# return '\n'.join(common_headers)
|
|
|
|
|
#
|
|
|
|
|
#
|
|
|
|
|
# def extract_text_by_page(file_path):
|
|
|
|
|
# common_header = extract_common_header(file_path)
|
|
|
|
|
# result = ""
|
|
|
|
|
# doc = fitz.open(file_path)
|
|
|
|
|
# num_pages = len(doc)
|
|
|
|
|
#
|
|
|
|
|
# for page_num in range(num_pages):
|
|
|
|
|
# page = doc[page_num]
|
|
|
|
|
# text = page.get_text()
|
|
|
|
|
# if text:
|
|
|
|
|
# cleaned_text = clean_page_content(text, common_header)
|
|
|
|
|
# print(cleaned_text)
|
|
|
|
|
# result += cleaned_text
|
|
|
|
|
# else:
|
|
|
|
|
# print(f"Page {page_num + 1} is empty or text could not be extracted.")
|
|
|
|
|
#
|
|
|
|
|
# doc.close()
|
|
|
|
|
# return result
|
|
|
|
|
|
|
|
|
|
|
2024-09-19 18:00:24 +08:00
|
|
|
|
if __name__ == '__main__':
    # Manual smoke test: run the extractor on one local sample document.
    # Alternative sample inputs kept for reference; enable one at a time.
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\广水农商行门禁控制主机及基础验证设备采购项目——磋商文件(定稿)(三次)_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
    # file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
    file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\交警支队机动车查验监管系统项目采购.pdf'

    # To inspect only the detected header:
    # ress = extract_common_header(file_path)
    # print(ress)

    # extract_text_by_page already prints each cleaned page as it goes.
    res = extract_text_by_page(file_path)
    # print(res)
|