zbparse/flask_app/货物标/货物标截取pdf.py
2024-09-13 16:05:16 +08:00

194 lines
8.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from PyPDF2 import PdfReader, PdfWriter
import re # 导入正则表达式库
import os # 用于文件和文件夹操作
from flask_app.main.format_change import docx2pdf
def clean_page_content(text, common_header):
# 首先删除抬头公共部分
if common_header: # 确保有公共抬头才进行替换
for header_line in common_header.split('\n'):
if header_line.strip(): # 只处理非空行
# 替换首次出现的完整行
text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)
# 删除页码 eg:89/129 这个代码分三步走可以把89/129完全删除
text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时
text = re.sub(r'\s+\d+\s*$', '', text) # 删除结尾的页码
text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 删除形如 /129 的页码
return text
def extract_common_header(pdf_path):
    """Guess the header text shared by pages around the middle of a PDF.

    Up to three consecutive pages, starting one page before the middle page,
    are sampled. The first three text lines of each sampled page are compared
    position by position, and the words present on every sampled page are
    kept for that line.

    Args:
        pdf_path: Path handed to ``PdfReader``.

    Returns:
        The shared header lines joined with newlines, or an empty string when
        fewer than two sampled pages yielded any text.
    """
    pdf_document = PdfReader(pdf_path)
    total_pages = len(pdf_document.pages)
    # Begin one page before the middle so the window straddles the middle page.
    start_page = max(0, total_pages // 2 - 1)
    num_pages_to_read = 3
    stop_page = min(start_page + num_pages_to_read, total_pages)

    headers = []
    for page_index in range(start_page, stop_page):
        text = pdf_document.pages[page_index].extract_text() or ""
        if text:
            # Only the first three lines of a page are header candidates.
            headers.append(text.strip().split('\n')[:3])

    if len(headers) < 2:
        return ""  # not enough pages to compare

    common_headers = []
    for lines in zip(*headers):
        first_line_words = lines[0].split()
        shared_words = set(first_line_words).intersection(
            *(set(line.split()) for line in lines[1:])
        )
        # Keep the words in the order they appear on the first sampled page.
        # Joining the set directly yields an arbitrary order, which then fails
        # to match the real header text when it is removed from each page.
        ordered_words = [word for word in first_line_words if word in shared_words]
        if ordered_words:
            common_headers.append(' '.join(ordered_words))
    return '\n'.join(common_headers)
def is_pdf_or_doc(filename):
    """Return True when *filename* ends with .pdf, .doc or .docx (any letter case)."""
    lowered_name = filename.lower()
    accepted_suffixes = ('.pdf', '.doc', '.docx')
    return any(lowered_name.endswith(suffix) for suffix in accepted_suffixes)
def convert_to_pdf(file_path):
    """Return a PDF path for *file_path*, converting Word documents on the way.

    Paths ending in .doc/.docx (any letter case) are handed to ``docx2pdf``
    and its result is returned unchanged; presumably that is the path of the
    converted PDF (confirm against ``flask_app.main.format_change``). Every
    other path is returned as given.
    """
    is_word_document = file_path.lower().endswith(('.doc', '.docx'))
    if not is_word_document:
        return file_path
    return docx2pdf(file_path)
def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """Extract the requested section from one document or every document in a folder.

    Args:
        input_path: A directory, or a single .pdf/.doc/.docx file.
        output_folder: Directory for the extracted PDFs; created when missing.
        begin_pattern: Regex marking the first page of the section.
        begin_page: Page index that the start page must exceed.
        end_pattern: Regex marking the page where the section stops.
        output_suffix: Suffix appended to the output file name.

    Returns:
        A list with the path of every extracted PDF (empty when nothing was
        produced or the input path is not usable).
    """
    if not os.path.exists(output_folder):
        # exist_ok covers the case where another process creates the folder
        # between the check above and this call.
        os.makedirs(output_folder, exist_ok=True)

    generated_files = []
    if os.path.isdir(input_path):
        # Only direct children of the folder with a supported extension.
        candidates = [
            os.path.join(input_path, name)
            for name in os.listdir(input_path)
            if is_pdf_or_doc(name)
        ]
    elif os.path.isfile(input_path) and is_pdf_or_doc(input_path):
        # A single file goes through the same extension check and Word-to-PDF
        # conversion as files found in a directory.
        candidates = [input_path]
    else:
        print("提供的路径既不是文件夹也不是PDF文件。")
        return generated_files

    for file_path in candidates:
        pdf_path = convert_to_pdf(file_path)
        output_pdf_path = extract_pages(pdf_path, output_folder, begin_pattern, begin_page,
                                        end_pattern, output_suffix)
        if output_pdf_path:
            generated_files.append(output_pdf_path)
    return generated_files
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
    """Second-pass section extraction using looser, multi-line patterns.

    Patterns exist only for the ``"procurement"`` suffix. Unlike the first
    pass, the heading may appear on any line of the page, pages mentioning
    the document composition ("文件的构成" / "文件的组成") are not accepted as
    a start page, and every later matching page replaces the previously
    found start page until an end page is found.

    Args:
        pdf_path: Path of the PDF to scan.
        output_folder: Directory receiving the extracted PDF.
        output_suffix: Section identifier; only ``"procurement"`` is supported.
        common_header: Shared page header removed from each page before matching.

    Returns:
        The path of the written PDF, or an empty string when the suffix is
        unsupported or no start/end page was found.
    """
    if output_suffix != "procurement":
        # No patterns are defined for other suffixes; report "not found"
        # instead of reaching the scan loop with undefined patterns.
        print(f"twice: 未找到起始或结束页在文件 {pdf_path} 中!")
        return ""

    begin_page = 5  # a start page must have an index greater than this
    # Pages matching this pattern are never accepted as the start page.
    exclusion_pattern = re.compile(r'文件的构成|文件的组成')
    begin_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:采购|技术标准).*|'
        r'^[一二三四五六七八九十百千]+、\s*采购清单',
        re.MULTILINE
    )
    end_pattern = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
        re.MULTILINE
    )

    pdf_document = PdfReader(pdf_path)
    start_page = None
    end_page = None
    for i, page in enumerate(pdf_document.pages):
        cleaned_text = clean_page_content(page.extract_text() or "", common_header)
        is_start_candidate = (
            i > begin_page
            and begin_pattern.search(cleaned_text)
            and not exclusion_pattern.search(cleaned_text)
        )
        if is_start_candidate:
            # Deliberately no "first match only" check: a later match
            # overrides an earlier one.
            start_page = i
        if start_page is not None and i > start_page and end_pattern.search(cleaned_text):
            end_page = i
            break

    if start_page is None or end_page is None:
        print(f"twice: 未找到起始或结束页在文件 {pdf_path} 中!")
        return ""
    return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """Locate a section by its start/end patterns and save it as a new PDF.

    The start page is the first page whose cleaned text matches
    ``begin_pattern`` and whose index is greater than ``begin_page``. The end
    page is the first later page matching ``end_pattern``. When either is
    missing and the suffix is ``"procurement"``, the result of
    ``extract_pages_twice`` is returned instead.

    Returns:
        The value returned by ``save_extracted_pages`` or
        ``extract_pages_twice``, or None when nothing was found or any
        exception occurred while processing.
    """
    try:
        common_header = extract_common_header(pdf_path)
        pdf_document = PdfReader(pdf_path)
        start_page, end_page = None, None
        page_count = len(pdf_document.pages)
        for page_index in range(page_count):
            raw_text = pdf_document.pages[page_index].extract_text() or ""
            cleaned_text = clean_page_content(raw_text, common_header)
            if start_page is None:
                # Only the first qualifying page becomes the start page.
                if re.search(begin_pattern, cleaned_text) and page_index > begin_page:
                    start_page = page_index
            if start_page is not None and re.search(end_pattern, cleaned_text) and page_index > start_page:
                end_page = page_index
                break

        section_found = start_page is not None and end_page is not None
        if section_found:
            return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
        if output_suffix == "procurement":
            # Fall back to the second-pass matcher for procurement sections.
            return extract_pages_twice(pdf_path, output_folder, output_suffix, common_header)
        print(f"未找到起始或结束页在文件 {pdf_path} 中!")
        return None
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return None
def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):
    """Write pages ``start_page``..``end_page`` (inclusive) to a new PDF file.

    The output file is named ``<source base name>_<output_suffix>.pdf`` and
    placed in ``output_folder``.

    Args:
        pdf_document: Reader object whose ``pages`` are copied.
        start_page: Index of the first page to copy.
        end_page: Index of the last page to copy (inclusive).
        pdf_path: Path of the source PDF; only its base name is used.
        output_folder: Directory in which the new PDF is written.
        output_suffix: Suffix appended to the base name.

    Returns:
        The path of the written PDF.
    """
    base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
    output_doc = PdfWriter()
    for page_num in range(start_page, end_page + 1):
        output_doc.add_page(pdf_document.pages[page_num])
    with open(output_pdf_path, 'wb') as f:
        output_doc.write(f)
    # Separators keep the two page indices and the path readable; without
    # them the values run together (e.g. "1020C:\...").
    print(f"已截取并保存页面从 {start_page} 到 {end_page} 为 {output_pdf_path}")
    return output_pdf_path
def truncate_pdf_main(input_path, output_folder, selection):
    """Extract the section identified by *selection* from the input document(s).

    Only ``selection == 1`` is supported; it uses the "procurement" patterns.
    Any other value prints a message and returns None.

    Returns:
        The list returned by ``process_input``, or None for an unsupported
        selection.
    """
    if selection != 1:
        print("无效的选择")
        return None

    # Matches headings of the form "第x章" / "第x部分" followed by the
    # service/project/business requirements or procurement wording.
    # NOTE: the extra alternative r'^[一二三四五六七八九十百千]+、\s*采购清单'
    # is commented out in this first pass and therefore not part of the pattern.
    section_start = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:服务|项目|商务).*?要求|'
        r'^第[一二三四五六七八九十百千]+(?:章|部分).*?采购.*',
    )
    section_end = re.compile(
        r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+'
    )
    minimum_page_index = 5
    suffix = "procurement"
    # Hand over to the shared file/folder processing routine.
    return process_input(input_path, output_folder, section_start, minimum_page_index, section_end, suffix)
def truncate_pdf_multiple(input_path, output_folder):
    """Run every supported selection and collect all extracted file paths.

    Args:
        input_path: File or directory forwarded to ``truncate_pdf_main``.
        output_folder: Directory receiving the extracted PDFs.

    Returns:
        A flat list with the paths produced for each selection.
    """
    truncate_files = []
    for selection in range(1, 2):
        files = truncate_pdf_main(input_path, output_folder, selection)
        # truncate_pdf_main returns None for a selection it does not
        # support; skip that instead of failing in extend().
        if files:
            truncate_files.extend(files)
    return truncate_files
if __name__ == "__main__":
    # Manual run against a local sample document (paths are machine-specific).
    input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件(2).pdf"
    output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\output1"
    truncate_pdf_multiple(input_path, output_folder)
    # Alternative single-selection run, kept for reference:
    # selection = 1  # e.g. 1 - bidder-instructions front table, 2 - evaluation method,
    #                #      3 - bidder-instructions body, 4 - tender notice up to contract terms
    # generated_files = truncate_pdf_main(input_path, output_folder, selection)
    # print("生成的文件:", generated_files)