10.24
This commit is contained in:
parent
89d709f9d6
commit
772aaa17f0
@ -49,130 +49,132 @@ def pdf2docx(local_path_in):
|
|||||||
print(f"format_change p2d:have downloaded file to: {downloaded_filepath}")
|
print(f"format_change p2d:have downloaded file to: {downloaded_filepath}")
|
||||||
return downloaded_filepath
|
return downloaded_filepath
|
||||||
|
|
||||||
def doc2docx(local_path_in):
|
# def doc2docx(local_path_in):
|
||||||
remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2d'
|
# remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2d'
|
||||||
receive_download_url = upload_file(local_path_in, remote_url)
|
# receive_download_url = upload_file(local_path_in, remote_url)
|
||||||
print(receive_download_url)
|
# print(receive_download_url)
|
||||||
filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹
|
# filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹
|
||||||
local_filename = os.path.join(folder, filename) # 输出文件名
|
# local_filename = os.path.join(folder, filename) # 输出文件名
|
||||||
downloaded_filepath, file_type = download_file(receive_download_url, local_filename)
|
# downloaded_filepath, file_type = download_file(receive_download_url, local_filename)
|
||||||
print(f"format_change d2d:have downloaded file to: {downloaded_filepath}")
|
# print(f"format_change d2d:have downloaded file to: {downloaded_filepath}")
|
||||||
return downloaded_filepath
|
# return downloaded_filepath
|
||||||
def docx2pdf(local_path_in):
|
# def docx2pdf(local_path_in):
|
||||||
remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p'
|
# remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p'
|
||||||
receive_download_url = upload_file(local_path_in, remote_url)
|
# receive_download_url = upload_file(local_path_in, remote_url)
|
||||||
filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹
|
# filename, folder = get_filename_and_folder(local_path_in) # 输入输出在同一个文件夹
|
||||||
local_filename = os.path.join(folder, filename) # 输出文件名
|
# local_filename = os.path.join(folder, filename) # 输出文件名
|
||||||
downloaded_filepath,file_type = download_file(receive_download_url, local_filename)
|
# downloaded_filepath,file_type = download_file(receive_download_url, local_filename)
|
||||||
print(f"format_change d2p:have downloaded file to: {downloaded_filepath}")
|
# print(f"format_change d2p:have downloaded file to: {downloaded_filepath}")
|
||||||
return downloaded_filepath
|
# return downloaded_filepath
|
||||||
# def docx2pdf(file_path):
|
|
||||||
# """
|
def docx2pdf(file_path):
|
||||||
# 将本地的 .docx 或 .doc 文件转换为 .pdf 文件。
|
"""
|
||||||
#
|
将本地的 .docx 或 .doc 文件转换为 .pdf 文件。
|
||||||
# 参数:
|
|
||||||
# - file_path: str, 本地文件的路径,支持 .docx 和 .doc 格式。
|
参数:
|
||||||
# """
|
- file_path: str, 本地文件的路径,支持 .docx 和 .doc 格式。
|
||||||
# # 检查文件是否存在
|
"""
|
||||||
# if not os.path.isfile(file_path):
|
# 检查文件是否存在
|
||||||
# raise FileNotFoundError(f"文件未找到: {file_path}")
|
if not os.path.isfile(file_path):
|
||||||
#
|
raise FileNotFoundError(f"文件未找到: {file_path}")
|
||||||
# # 获取文件名和扩展名
|
|
||||||
# base_name = os.path.basename(file_path)
|
# 获取文件名和扩展名
|
||||||
# name, ext = os.path.splitext(base_name)
|
base_name = os.path.basename(file_path)
|
||||||
# ext = ext.lower().lstrip('.')
|
name, ext = os.path.splitext(base_name)
|
||||||
#
|
ext = ext.lower().lstrip('.')
|
||||||
# if ext not in ['docx', 'doc']:
|
|
||||||
# raise ValueError(f"doc2pdf 仅支持 .docx 和 .doc 文件,当前文件扩展名为: .{ext}")
|
if ext not in ['docx', 'doc']:
|
||||||
#
|
raise ValueError(f"doc2pdf 仅支持 .docx 和 .doc 文件,当前文件扩展名为: .{ext}")
|
||||||
# # 定义转换接口
|
|
||||||
# endpoint = 'http://120.26.236.97:5008/convert_to_pdf'
|
# 定义转换接口
|
||||||
#
|
endpoint = 'http://120.26.236.97:5008/convert_to_pdf'
|
||||||
# # 获取文件所在目录
|
# endpoint = 'http://192.168.0.2:5008/convert_to_pdf'
|
||||||
# output_dir = os.path.dirname(file_path)
|
|
||||||
#
|
# 获取文件所在目录
|
||||||
# # 准备上传的文件
|
output_dir = os.path.dirname(file_path)
|
||||||
# with open(file_path, 'rb') as f:
|
|
||||||
# files = {'file': (base_name, f)}
|
# 准备上传的文件
|
||||||
# try:
|
with open(file_path, 'rb') as f:
|
||||||
# print(f"正在将 {base_name} 转换为 .pdf 格式...")
|
files = {'file': (base_name, f)}
|
||||||
# response = requests.post(endpoint, files=files)
|
try:
|
||||||
# response.raise_for_status() # 检查请求是否成功
|
print(f"正在将 {base_name} 转换为 .pdf 格式...")
|
||||||
# except requests.RequestException as e:
|
response = requests.post(endpoint, files=files)
|
||||||
# print(f"转换过程中发生错误: {e}")
|
response.raise_for_status() # 检查请求是否成功
|
||||||
# return
|
except requests.RequestException as e:
|
||||||
#
|
print(f"转换过程中发生错误: {e}")
|
||||||
# # 准备保存转换后文件的路径
|
return
|
||||||
# output_file_name = f"{name}.pdf"
|
|
||||||
# output_path = os.path.join(output_dir, output_file_name)
|
# 准备保存转换后文件的路径
|
||||||
#
|
output_file_name = f"{name}.pdf"
|
||||||
# # 保存转换后的文件
|
output_path = os.path.join(output_dir, output_file_name)
|
||||||
# with open(output_path, 'wb') as out_file:
|
|
||||||
# out_file.write(response.content)
|
# 保存转换后的文件
|
||||||
#
|
with open(output_path, 'wb') as out_file:
|
||||||
# print(f"文件已成功转换并保存至: {output_path}")
|
out_file.write(response.content)
|
||||||
# return output_path
|
|
||||||
#
|
print(f"文件已成功转换并保存至: {output_path}")
|
||||||
#
|
return output_path
|
||||||
# def doc2docx(file_path):
|
|
||||||
# """
|
|
||||||
# 将本地的 .doc 文件转换为 .docx 文件。
|
def doc2docx(file_path):
|
||||||
#
|
"""
|
||||||
# 参数:
|
将本地的 .doc 文件转换为 .docx 文件。
|
||||||
# - file_path: str, 本地文件的路径,支持 .doc 格式。
|
|
||||||
# """
|
参数:
|
||||||
# # 检查文件是否存在
|
- file_path: str, 本地文件的路径,支持 .doc 格式。
|
||||||
# if not os.path.isfile(file_path):
|
"""
|
||||||
# raise FileNotFoundError(f"文件未找到: {file_path}")
|
# 检查文件是否存在
|
||||||
#
|
if not os.path.isfile(file_path):
|
||||||
# # 获取文件名和扩展名
|
raise FileNotFoundError(f"文件未找到: {file_path}")
|
||||||
# base_name = os.path.basename(file_path)
|
|
||||||
# name, ext = os.path.splitext(base_name)
|
# 获取文件名和扩展名
|
||||||
# ext = ext.lower().lstrip('.')
|
base_name = os.path.basename(file_path)
|
||||||
#
|
name, ext = os.path.splitext(base_name)
|
||||||
# if ext != 'doc':
|
ext = ext.lower().lstrip('.')
|
||||||
# raise ValueError(f"doc2docx 仅支持 .doc 文件,当前文件扩展名为: .{ext}")
|
|
||||||
#
|
if ext != 'doc':
|
||||||
# # 定义转换接口
|
raise ValueError(f"doc2docx 仅支持 .doc 文件,当前文件扩展名为: .{ext}")
|
||||||
# endpoint = 'http://120.26.236.97:5008/convert_to_docx'
|
|
||||||
#
|
# 定义转换接口
|
||||||
# # 获取文件所在目录
|
endpoint = 'http://120.26.236.97:5008/convert_to_docx'
|
||||||
# output_dir = os.path.dirname(file_path)
|
|
||||||
#
|
# 获取文件所在目录
|
||||||
# # 准备上传的文件
|
output_dir = os.path.dirname(file_path)
|
||||||
# with open(file_path, 'rb') as f:
|
|
||||||
# files = {'file': (base_name, f)}
|
# 准备上传的文件
|
||||||
# try:
|
with open(file_path, 'rb') as f:
|
||||||
# print(f"正在将 {base_name} 转换为 .docx 格式...")
|
files = {'file': (base_name, f)}
|
||||||
# response = requests.post(endpoint, files=files)
|
try:
|
||||||
# response.raise_for_status() # 检查请求是否成功
|
print(f"正在将 {base_name} 转换为 .docx 格式...")
|
||||||
# except requests.RequestException as e:
|
response = requests.post(endpoint, files=files)
|
||||||
# print(f"转换过程中发生错误: {e}")
|
response.raise_for_status() # 检查请求是否成功
|
||||||
# return
|
except requests.RequestException as e:
|
||||||
#
|
print(f"转换过程中发生错误: {e}")
|
||||||
# # 准备保存转换后文件的路径
|
return
|
||||||
# output_file_name = f"{name}.docx"
|
|
||||||
# output_path = os.path.join(output_dir, output_file_name)
|
# 准备保存转换后文件的路径
|
||||||
#
|
output_file_name = f"{name}.docx"
|
||||||
# # 保存转换后的文件
|
output_path = os.path.join(output_dir, output_file_name)
|
||||||
# with open(output_path, 'wb') as out_file:
|
|
||||||
# out_file.write(response.content)
|
# 保存转换后的文件
|
||||||
#
|
with open(output_path, 'wb') as out_file:
|
||||||
# print(f"文件已成功转换并保存至: {output_path}")
|
out_file.write(response.content)
|
||||||
# return output_path
|
|
||||||
|
print(f"文件已成功转换并保存至: {output_path}")
|
||||||
|
return output_path
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# 替换为你的文件路径和API URL
|
# 替换为你的文件路径和API URL
|
||||||
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\1fbbb6ff-7ddc-40bb-8857-b7de37aece3f\\兴欣工程.pdf"
|
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\1fbbb6ff-7ddc-40bb-8857-b7de37aece3f\\兴欣工程.pdf"
|
||||||
local_path_in = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.docx"
|
# local_path_in = "C:\\Users\\Administrator\\Desktop\\fsdownload\\b151fcd0-4cd8-49b4-8de3-964057a9e653\\ztbfile.docx"
|
||||||
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf"
|
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf"
|
||||||
|
local_path_in ="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\招标文件.pdf"
|
||||||
# downloaded_file=doc2docx(local_path_in)
|
# downloaded_file=doc2docx(local_path_in)
|
||||||
# downloaded_file=pdf2docx(local_path_in)
|
downloaded_file=pdf2docx(local_path_in)
|
||||||
for i in range(1):
|
# downloaded_file=docx2pdf(local_path_in)
|
||||||
downloaded_file=docx2pdf(local_path_in)
|
print(downloaded_file)
|
||||||
print(downloaded_file)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -153,7 +153,7 @@ if __name__ == '__main__':
|
|||||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
|
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\2-招标文件(2020年广水市中小学教师办公电脑系统及多媒体“班班通”设备采购安装项目)_tobidders_notice_part2.pdf'
|
||||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
|
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\output4\\磋商文件_tobidders_notice_part2.pdf'
|
||||||
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
|
# file_path = 'C:\\Users\\Administrator\\Desktop\\货物标\\截取test\\交警支队机动车查验监管系统项目采购_tobidders_notice_part1.pdf'
|
||||||
file_path='D:\\flask_project\\flask_app\\static\\output\\fee18877-0c60-4c28-911f-9a5f7d1325a7\\ztbfile_tobidders_notice_part2.pdf'
|
file_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\\zbtest12.pdf"
|
||||||
# ress = extract_common_header(file_path)
|
# ress = extract_common_header(file_path)
|
||||||
# print(ress)
|
# print(ress)
|
||||||
res=extract_text_by_page(file_path)
|
res=extract_text_by_page(file_path)
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
from PyPDF2 import PdfReader, PdfWriter
|
from PyPDF2 import PdfReader, PdfWriter
|
||||||
import re # 导入正则表达式库
|
import re # 导入正则表达式库
|
||||||
import os # 用于文件和文件夹操作
|
import os # 用于文件和文件夹操作
|
||||||
# from flask_app.general.format_change import docx2pdf # 确保此模块可用
|
|
||||||
|
|
||||||
def clean_page_content(text, common_header):
|
def clean_page_content(text, common_header):
|
||||||
"""
|
"""
|
||||||
@ -13,14 +12,14 @@ def clean_page_content(text, common_header):
|
|||||||
# 替换首次出现的完整行
|
# 替换首次出现的完整行
|
||||||
text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)
|
text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)
|
||||||
|
|
||||||
# 删除页码,合并多个正则表达式为一个
|
# 删除页码,支持多种格式
|
||||||
text = re.sub(
|
text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时
|
||||||
r'(^\s*\d+\s*(?=\D))|(\s+\d+\s*$)|(\s*/\s*\d+\s*)|(\s*[—-]\s*\d+\s*[—-]\s*)',
|
text = re.sub(r'\s+\d+\s*$', '', text) # 删除结尾的页码
|
||||||
'',
|
text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 删除形如 /129 的页码
|
||||||
text,
|
text = re.sub(r'\s*[—-]\s*\d+\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码
|
||||||
flags=re.MULTILINE
|
|
||||||
)
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
def extract_common_header(pdf_path):
|
def extract_common_header(pdf_path):
|
||||||
"""
|
"""
|
||||||
从PDF的前几页提取公共抬头。
|
从PDF的前几页提取公共抬头。
|
||||||
@ -39,13 +38,19 @@ def extract_common_header(pdf_path):
|
|||||||
if text:
|
if text:
|
||||||
# 只取每页的前三行
|
# 只取每页的前三行
|
||||||
first_lines = text.strip().split('\n')[:3]
|
first_lines = text.strip().split('\n')[:3]
|
||||||
headers.append(set(line.strip() for line in first_lines if line.strip()))
|
headers.append(first_lines)
|
||||||
|
|
||||||
if len(headers) < 2:
|
if len(headers) < 2:
|
||||||
return "" # 如果没有足够的页来比较,返回空字符串
|
return "" # 如果没有足够的页来比较,返回空字符串
|
||||||
|
|
||||||
# 寻找每一行中的公共部分,按顺序保留
|
# 寻找每一行中的公共部分,按顺序保留
|
||||||
common_headers = set.intersection(*headers)
|
common_headers = []
|
||||||
|
for lines in zip(*headers):
|
||||||
|
first_words = lines[0].split()
|
||||||
|
common_line = [word for word in first_words if all(word in line.split() for line in lines[1:])]
|
||||||
|
if common_line:
|
||||||
|
common_headers.append(' '.join(common_line))
|
||||||
|
|
||||||
return '\n'.join(common_headers)
|
return '\n'.join(common_headers)
|
||||||
|
|
||||||
|
|
||||||
@ -67,13 +72,17 @@ def is_pdf_or_doc(filename):
|
|||||||
|
|
||||||
def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):
|
def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):
|
||||||
"""
|
"""
|
||||||
将提取的页面保存为新的PDF文件。
|
将提取的页面保存为新的PDF文件,统一命名为 "bid_format.pdf" 并存放在独立的子文件夹中。
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
output_pdf_path = os.path.join(output_folder, "bid_format.pdf")
|
||||||
output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
|
|
||||||
|
|
||||||
if start_page is None or end_page is None or start_page > end_page:
|
# 检查 start_page 和 end_page 是否为 None
|
||||||
|
if start_page is None or end_page is None:
|
||||||
|
print(f"起始页或结束页为 None: start_page={start_page}, end_page={end_page} 文件: {pdf_path}")
|
||||||
|
return ""
|
||||||
|
|
||||||
|
if start_page < 0 or end_page >= len(pdf_document.pages) or start_page > end_page:
|
||||||
print(f"无效的页面范围: {start_page} 到 {end_page} 文件: {pdf_path}")
|
print(f"无效的页面范围: {start_page} 到 {end_page} 文件: {pdf_path}")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
@ -89,29 +98,57 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo
|
|||||||
return "" # 返回空字符串
|
return "" # 返回空字符串
|
||||||
|
|
||||||
|
|
||||||
def find_page_indices(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
|
def extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
|
||||||
"""
|
"""
|
||||||
查找起始页和结束页的索引。
|
通用函数,根据模式提取起始页和结束页。
|
||||||
"""
|
"""
|
||||||
start_page = None
|
start_page = None
|
||||||
end_page = None
|
end_page = None
|
||||||
total_pages = len(pdf_document.pages)
|
|
||||||
|
|
||||||
for i, page in enumerate(pdf_document.pages):
|
for i, page in enumerate(pdf_document.pages):
|
||||||
if i <= begin_page:
|
|
||||||
continue
|
|
||||||
text = page.extract_text() or ""
|
text = page.extract_text() or ""
|
||||||
cleaned_text = clean_page_content(text, common_header)
|
cleaned_text = clean_page_content(text, common_header)
|
||||||
if start_page is None and re.search(begin_pattern, cleaned_text):
|
# print(cleaned_text)
|
||||||
|
if start_page is None and re.search(begin_pattern, cleaned_text) and i > begin_page:
|
||||||
start_page = i
|
start_page = i
|
||||||
continue
|
if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page:
|
||||||
if start_page is not None and re.search(end_pattern, cleaned_text):
|
|
||||||
end_page = i
|
end_page = i
|
||||||
break
|
break
|
||||||
|
return start_page, end_page
|
||||||
|
|
||||||
return start_page, end_page if end_page else total_pages - 1
|
def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
|
||||||
|
pdf_document = PdfReader(pdf_path)
|
||||||
|
total_pages = len(pdf_document.pages) - 1 # 获取总页数
|
||||||
|
begin_page = 10
|
||||||
|
start_page = None
|
||||||
|
end_page = None
|
||||||
|
begin_patterns = [
|
||||||
|
re.compile(r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*', re.MULTILINE),
|
||||||
|
re.compile(r"\s*(投标文件格式|响应文件格式|响应性文件格式)\s*")
|
||||||
|
]
|
||||||
|
exclusion_pattern = re.compile(r'文件的构成|文件的组成|目录|评标办法|技术标准|其他材料|合同')
|
||||||
|
end_pattern = re.compile(
|
||||||
|
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE)
|
||||||
|
for begin_pattern in begin_patterns:
|
||||||
|
for i, page in enumerate(pdf_document.pages):
|
||||||
|
text = page.extract_text() or ""
|
||||||
|
cleaned_text = clean_page_content(text, common_header)
|
||||||
|
if exclusion_pattern and re.search(exclusion_pattern, cleaned_text):
|
||||||
|
continue
|
||||||
|
if re.search(begin_pattern, cleaned_text) and i > begin_page:
|
||||||
|
start_page = i
|
||||||
|
if start_page is not None and re.search(end_pattern, cleaned_text) and i > start_page:
|
||||||
|
end_page = i
|
||||||
|
break
|
||||||
|
if start_page is not None:
|
||||||
|
break
|
||||||
|
if start_page is None:
|
||||||
|
print(f"{output_suffix}: 未找到起始页,提取失败! 文件: {pdf_path}")
|
||||||
|
return ""
|
||||||
|
if end_page is None:
|
||||||
|
# 如果未匹配到结束页,默认截取到文件末尾
|
||||||
|
end_page = total_pages
|
||||||
|
print(f"{output_suffix}: 未找到结束页,默认截取到文件末尾。 文件: {pdf_path}")
|
||||||
|
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
|
||||||
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
|
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
|
||||||
"""
|
"""
|
||||||
根据开始和结束模式提取PDF页面。
|
根据开始和结束模式提取PDF页面。
|
||||||
@ -119,31 +156,20 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
|
|||||||
try:
|
try:
|
||||||
common_header = extract_common_header(pdf_path)
|
common_header = extract_common_header(pdf_path)
|
||||||
pdf_document = PdfReader(pdf_path)
|
pdf_document = PdfReader(pdf_path)
|
||||||
|
total_pages = len(pdf_document.pages) - 1 # 获取总页数
|
||||||
|
|
||||||
start_page, end_page = find_page_indices(pdf_document, begin_pattern, end_pattern, begin_page, common_header)
|
# 提取起始页和结束页
|
||||||
|
start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header)
|
||||||
|
|
||||||
if output_suffix == "format" and start_page is None:
|
if output_suffix == "format":
|
||||||
print(f"{output_suffix}: 未找到起始页,尝试二次提取! 文件: {pdf_path}")
|
|
||||||
# 二次提取逻辑,保留原有功能
|
|
||||||
start_page, end_page = find_page_indices(
|
|
||||||
pdf_document,
|
|
||||||
re.compile(r'^(?:响应|投标).*?格式.*', re.MULTILINE),
|
|
||||||
end_pattern,
|
|
||||||
begin_page,
|
|
||||||
common_header
|
|
||||||
)
|
|
||||||
if start_page is None:
|
if start_page is None:
|
||||||
print(f"{output_suffix}: 未找到起始页,提取失败! 文件: {pdf_path}")
|
print(f"{output_suffix}: 未找到起始页,提取失败! 文件: {pdf_path} 尝试二次提取!")
|
||||||
return ""
|
return extract_pages_twice(pdf_path,output_folder,output_suffix,common_header)
|
||||||
|
if end_page is None:
|
||||||
if start_page is None:
|
# 如果未匹配到结束页,默认截取到文件末尾
|
||||||
print(f"未找到起始页,提取失败! 文件: {pdf_path}")
|
end_page = total_pages
|
||||||
return ""
|
print(f"{output_suffix}: 未找到结束页,默认截取到文件末尾。 文件: {pdf_path}")
|
||||||
|
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
|
||||||
if end_page is None:
|
|
||||||
end_page = len(pdf_document.pages) - 1
|
|
||||||
print(f"{output_suffix}: 未找到结束页,默认截取到文件末尾。 文件: {pdf_path}")
|
|
||||||
|
|
||||||
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
|
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error processing {pdf_path}: {e}")
|
print(f"Error processing {pdf_path}: {e}")
|
||||||
@ -156,7 +182,8 @@ def process_files(file_path, output_folder, begin_pattern, begin_page, end_patte
|
|||||||
"""
|
"""
|
||||||
# pdf_path = convert_to_pdf(file_path)
|
# pdf_path = convert_to_pdf(file_path)
|
||||||
pdf_path=file_path
|
pdf_path=file_path
|
||||||
return extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) or ""
|
result = extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
||||||
|
return result or ""
|
||||||
|
|
||||||
|
|
||||||
def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
|
def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
|
||||||
@ -172,47 +199,52 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt
|
|||||||
file_path = os.path.join(input_path, file_name)
|
file_path = os.path.join(input_path, file_name)
|
||||||
if is_pdf_or_doc(file_path):
|
if is_pdf_or_doc(file_path):
|
||||||
result = process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
result = process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
||||||
generated_files.append(result)
|
if isinstance(result, tuple):
|
||||||
|
generated_files.extend([f if f else "" for f in result]) # 保留空字符串
|
||||||
|
else:
|
||||||
|
generated_files.append(result) # 直接添加result,可能是空字符串
|
||||||
elif os.path.isfile(input_path) and is_pdf_or_doc(input_path):
|
elif os.path.isfile(input_path) and is_pdf_or_doc(input_path):
|
||||||
result = process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
result = process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
|
||||||
generated_files.append(result)
|
if isinstance(result, tuple):
|
||||||
|
generated_files.extend([f if f else "" for f in result]) # 保留空字符串
|
||||||
|
else:
|
||||||
|
generated_files.append(result) # 直接添加result,可能是空字符串
|
||||||
else:
|
else:
|
||||||
print("提供的路径既不是文件夹也不是PDF/Word文件。")
|
print("提供的路径既不是文件夹也不是PDF文件。")
|
||||||
return [f for f in generated_files if f] # 过滤空字符串
|
|
||||||
|
return generated_files
|
||||||
|
|
||||||
|
|
||||||
def truncate_pdf_main(input_path, output_folder, selection):
|
def truncate_pdf_main(input_path, output_folder, selection,begin_page=10):
|
||||||
"""
|
"""
|
||||||
主函数,根据选择截取PDF内容。
|
主函数,根据选择截取PDF内容。
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
if selection == 1: # 投标文件格式
|
if selection == 1: # 投标文件格式
|
||||||
begin_page = 10
|
|
||||||
begin_pattern = re.compile(
|
begin_pattern = re.compile(
|
||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*',
|
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*'
|
||||||
)
|
)
|
||||||
end_pattern = re.compile(
|
end_pattern = re.compile(
|
||||||
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
|
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE
|
||||||
re.MULTILINE
|
|
||||||
)
|
)
|
||||||
local_output_suffix = "format"
|
local_output_suffix = "format"
|
||||||
else:
|
else:
|
||||||
print("无效的选择:请选择1")
|
print("无效的选择:请选择1")
|
||||||
return []
|
return None
|
||||||
|
|
||||||
# 调用相应的处理函数
|
# 调用相应的处理函数
|
||||||
return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, local_output_suffix)
|
return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, local_output_suffix) or ""
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error in truncate_pdf_main: {e}")
|
print(f"Error in truncate_pdf_main: {e}")
|
||||||
return []
|
return "" # 返回空字符串
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# 定义输入和输出路径
|
# 定义输入和输出路径
|
||||||
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles"
|
# input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\招标文件(2).pdf"
|
||||||
input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹"
|
input_path="C:\\Users\\Administrator\\Desktop\\招标文件\\招标test文件夹\\zbtest12.pdf"
|
||||||
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
|
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
|
||||||
selection = 1 # 1 - 投标文件格式
|
selection = 1 # 1 - 投标文件格式
|
||||||
# 执行截取
|
# 执行截取
|
||||||
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
generated_files = truncate_pdf_main(input_path, output_folder, selection)
|
||||||
# print("生成的文件:", generated_files)
|
print("生成的文件:", generated_files)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user