This commit is contained in:
zy123 2024-10-24 16:55:39 +08:00
parent 01cd7977ec
commit 4344f9398e
3 changed files with 189 additions and 175 deletions

View File

@ -49,115 +49,118 @@ def pdf2docx(local_path_in):
print(f"format_change p2d:have downloaded file to: {downloaded_filepath}") print(f"format_change p2d:have downloaded file to: {downloaded_filepath}")
return downloaded_filepath return downloaded_filepath
def doc2docx(local_path_in):
    """Convert a local .doc file to .docx through the remote transfer service.

    The file is uploaded to the ``d2d`` endpoint and the result is downloaded
    into the same folder as the input. The conversion itself is performed by
    the remote service.

    Args:
        local_path_in: Path of the local file to convert.

    Returns:
        The file path reported by ``download_file`` for the downloaded result.
    """
    remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2d'
    receive_download_url = upload_file(local_path_in, remote_url)
    print(receive_download_url)
    # Input and output share the same folder.
    filename, folder = get_filename_and_folder(local_path_in)
    # Output file name; presumably download_file adds the extension -- TODO confirm.
    local_filename = os.path.join(folder, filename)
    # download_file also returns the file type, which is not needed here.
    downloaded_filepath, _ = download_file(receive_download_url, local_filename)
    print(f"format_change d2d:have downloaded file to: {downloaded_filepath}")
    return downloaded_filepath
def docx2pdf(local_path_in):
    """Convert a local Word file to PDF through the remote transfer service.

    The file is uploaded to the ``d2p`` endpoint and the result is downloaded
    into the same folder as the input. The conversion itself is performed by
    the remote service.

    Args:
        local_path_in: Path of the local file to convert.

    Returns:
        The file path reported by ``download_file`` for the downloaded result.
    """
    remote_url = 'http://47.98.59.178:8000/v3/3rd/files/transfer/d2p'
    receive_download_url = upload_file(local_path_in, remote_url)
    # Input and output share the same folder.
    filename, folder = get_filename_and_folder(local_path_in)
    # Output file name; presumably download_file adds the extension -- TODO confirm.
    local_filename = os.path.join(folder, filename)
    # download_file also returns the file type, which is not needed here.
    downloaded_filepath, _ = download_file(receive_download_url, local_filename)
    print(f"format_change d2p:have downloaded file to: {downloaded_filepath}")
    return downloaded_filepath
def docx2pdf(file_path):
"""
将本地的 .docx .doc 文件转换为 .pdf 文件
参数 # def docx2pdf(file_path):
- file_path: str, 本地文件的路径支持 .docx .doc 格式 # """
""" # 将本地的 .docx 或 .doc 文件转换为 .pdf 文件。
# 检查文件是否存在 #
if not os.path.isfile(file_path): # 参数:
raise FileNotFoundError(f"文件未找到: {file_path}") # - file_path: str, 本地文件的路径,支持 .docx 和 .doc 格式。
# """
# 获取文件名和扩展名 # # 检查文件是否存在
base_name = os.path.basename(file_path) # if not os.path.isfile(file_path):
name, ext = os.path.splitext(base_name) # raise FileNotFoundError(f"文件未找到: {file_path}")
ext = ext.lower().lstrip('.') #
# # 获取文件名和扩展名
if ext not in ['docx', 'doc']: # base_name = os.path.basename(file_path)
raise ValueError(f"doc2pdf 仅支持 .docx 和 .doc 文件,当前文件扩展名为: .{ext}") # name, ext = os.path.splitext(base_name)
# ext = ext.lower().lstrip('.')
# 定义转换接口 #
endpoint = 'http://120.26.236.97:5008/convert_to_pdf' # if ext not in ['docx', 'doc']:
# raise ValueError(f"doc2pdf 仅支持 .docx 和 .doc 文件,当前文件扩展名为: .{ext}")
# 获取文件所在目录 #
output_dir = os.path.dirname(file_path) # # 定义转换接口
# endpoint = 'http://120.26.236.97:5008/convert_to_pdf'
# 准备上传的文件 #
with open(file_path, 'rb') as f: # # 获取文件所在目录
files = {'file': (base_name, f)} # output_dir = os.path.dirname(file_path)
try: #
print(f"正在将 {base_name} 转换为 .pdf 格式...") # # 准备上传的文件
response = requests.post(endpoint, files=files) # with open(file_path, 'rb') as f:
response.raise_for_status() # 检查请求是否成功 # files = {'file': (base_name, f)}
except requests.RequestException as e: # try:
print(f"转换过程中发生错误: {e}") # print(f"正在将 {base_name} 转换为 .pdf 格式...")
return # response = requests.post(endpoint, files=files)
# response.raise_for_status() # 检查请求是否成功
# 准备保存转换后文件的路径 # except requests.RequestException as e:
output_file_name = f"{name}.pdf" # print(f"转换过程中发生错误: {e}")
output_path = os.path.join(output_dir, output_file_name) # return
#
# 保存转换后的文件 # # 准备保存转换后文件的路径
with open(output_path, 'wb') as out_file: # output_file_name = f"{name}.pdf"
out_file.write(response.content) # output_path = os.path.join(output_dir, output_file_name)
#
print(f"文件已成功转换并保存至: {output_path}") # # 保存转换后的文件
# with open(output_path, 'wb') as out_file:
# out_file.write(response.content)
def doc2docx(file_path): #
""" # print(f"文件已成功转换并保存至: {output_path}")
将本地的 .doc 文件转换为 .docx 文件 # return output_path
#
参数 #
- file_path: str, 本地文件的路径支持 .doc 格式 # def doc2docx(file_path):
""" # """
# 检查文件是否存在 # 将本地的 .doc 文件转换为 .docx 文件。
if not os.path.isfile(file_path): #
raise FileNotFoundError(f"文件未找到: {file_path}") # 参数:
# - file_path: str, 本地文件的路径,支持 .doc 格式。
# 获取文件名和扩展名 # """
base_name = os.path.basename(file_path) # # 检查文件是否存在
name, ext = os.path.splitext(base_name) # if not os.path.isfile(file_path):
ext = ext.lower().lstrip('.') # raise FileNotFoundError(f"文件未找到: {file_path}")
#
if ext != 'doc': # # 获取文件名和扩展名
raise ValueError(f"doc2docx 仅支持 .doc 文件,当前文件扩展名为: .{ext}") # base_name = os.path.basename(file_path)
# name, ext = os.path.splitext(base_name)
# 定义转换接口 # ext = ext.lower().lstrip('.')
endpoint = 'http://120.26.236.97:5008/convert_to_docx' #
# if ext != 'doc':
# 获取文件所在目录 # raise ValueError(f"doc2docx 仅支持 .doc 文件,当前文件扩展名为: .{ext}")
output_dir = os.path.dirname(file_path) #
# # 定义转换接口
# 准备上传的文件 # endpoint = 'http://120.26.236.97:5008/convert_to_docx'
with open(file_path, 'rb') as f: #
files = {'file': (base_name, f)} # # 获取文件所在目录
try: # output_dir = os.path.dirname(file_path)
print(f"正在将 {base_name} 转换为 .docx 格式...") #
response = requests.post(endpoint, files=files) # # 准备上传的文件
response.raise_for_status() # 检查请求是否成功 # with open(file_path, 'rb') as f:
except requests.RequestException as e: # files = {'file': (base_name, f)}
print(f"转换过程中发生错误: {e}") # try:
return # print(f"正在将 {base_name} 转换为 .docx 格式...")
# response = requests.post(endpoint, files=files)
# 准备保存转换后文件的路径 # response.raise_for_status() # 检查请求是否成功
output_file_name = f"{name}.docx" # except requests.RequestException as e:
output_path = os.path.join(output_dir, output_file_name) # print(f"转换过程中发生错误: {e}")
# return
# 保存转换后的文件 #
with open(output_path, 'wb') as out_file: # # 准备保存转换后文件的路径
out_file.write(response.content) # output_file_name = f"{name}.docx"
# output_path = os.path.join(output_dir, output_file_name)
print(f"文件已成功转换并保存至: {output_path}") #
# # 保存转换后的文件
# with open(output_path, 'wb') as out_file:
# out_file.write(response.content)
#
# print(f"文件已成功转换并保存至: {output_path}")
# return output_path
@ -168,8 +171,9 @@ if __name__ == '__main__':
# local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf" # local_path_in="C:\\Users\\Administrator\\Desktop\\fsdownload\\ztbfile.pdf"
# downloaded_file=doc2docx(local_path_in) # downloaded_file=doc2docx(local_path_in)
# downloaded_file=pdf2docx(local_path_in) # downloaded_file=pdf2docx(local_path_in)
downloaded_file=docx2pdf(local_path_in) for i in range(1):
print(downloaded_file) downloaded_file=docx2pdf(local_path_in)
print(downloaded_file)

View File

@ -1,8 +1,7 @@
from PyPDF2 import PdfReader, PdfWriter from PyPDF2 import PdfReader, PdfWriter
import re # 导入正则表达式库 import re # 导入正则表达式库
import os # 用于文件和文件夹操作 import os # 用于文件和文件夹操作
from flask_app.general.format_change import docx2pdf # 确保此模块可用 # from flask_app.general.format_change import docx2pdf # 确保此模块可用
from flask_app.general.merge_pdfs import merge_pdfs # 确保此模块可用
def clean_page_content(text, common_header): def clean_page_content(text, common_header):
""" """
@ -14,13 +13,14 @@ def clean_page_content(text, common_header):
# 替换首次出现的完整行 # 替换首次出现的完整行
text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1) text = re.sub(r'^' + re.escape(header_line.strip()) + r'\n?', '', text, count=1)
# 删除页码,支持多种格式 # 删除页码,合并多个正则表达式为一个
text = re.sub(r'^\s*\d+\s*(?=\D)', '', text) # 删除开头的页码,仅当紧跟非数字字符时 text = re.sub(
text = re.sub(r'\s+\d+\s*$', '', text) # 删除结尾的页码 r'(^\s*\d+\s*(?=\D))|(\s+\d+\s*$)|(\s*/\s*\d+\s*)|(\s*[—-]\s*\d+\s*[—-]\s*)',
text = re.sub(r'\s*\/\s*\d+\s*', '', text) # 删除形如 /129 的页码 '',
text = re.sub(r'\s*[—-]\s*\d+\s*[—-]\s*', '', text) # 删除形如 '—2—' 或 '-2-' 的页码 text,
flags=re.MULTILINE
)
return text return text
def extract_common_header(pdf_path): def extract_common_header(pdf_path):
""" """
从PDF的前几页提取公共抬头 从PDF的前几页提取公共抬头
@ -39,34 +39,31 @@ def extract_common_header(pdf_path):
if text: if text:
# 只取每页的前三行 # 只取每页的前三行
first_lines = text.strip().split('\n')[:3] first_lines = text.strip().split('\n')[:3]
headers.append(first_lines) headers.append(set(line.strip() for line in first_lines if line.strip()))
if len(headers) < 2: if len(headers) < 2:
return "" # 如果没有足够的页来比较,返回空字符串 return "" # 如果没有足够的页来比较,返回空字符串
# 寻找每一行中的公共部分,按顺序保留 # 寻找每一行中的公共部分,按顺序保留
common_headers = [] common_headers = set.intersection(*headers)
for lines in zip(*headers):
first_words = lines[0].split()
common_line = [word for word in first_words if all(word in line.split() for line in lines[1:])]
if common_line:
common_headers.append(' '.join(common_line))
return '\n'.join(common_headers) return '\n'.join(common_headers)
def is_pdf_or_doc(filename):
    """Return True when *filename* names a PDF or Word document.

    The check is purely on the file name: it must end with ``.pdf``, ``.doc``
    or ``.docx``, compared case-insensitively. The file is not opened.
    """
    return filename.lower().endswith(('.pdf', '.doc', '.docx'))
def convert_to_pdf(file_path):
""" # def convert_to_pdf(file_path):
如果是Word文档则转换为PDF # """
""" # 如果是Word文档则转换为PDF。
if file_path.lower().endswith(('.doc', '.docx')): # """
return docx2pdf(file_path) # if file_path.lower().endswith(('.doc', '.docx')):
return file_path # return docx2pdf(file_path)
# return file_path
def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix): def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix):
""" """
@ -76,8 +73,8 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo
base_file_name = os.path.splitext(os.path.basename(pdf_path))[0] base_file_name = os.path.splitext(os.path.basename(pdf_path))[0]
output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf") output_pdf_path = os.path.join(output_folder, f"{base_file_name}_{output_suffix}.pdf")
if start_page < 0 or end_page >= len(pdf_document.pages) or start_page > end_page: if start_page is None or end_page is None or start_page > end_page:
print(f"无效的页面范围: {start_page}{end_page}") print(f"无效的页面范围: {start_page}{end_page} 文件: {pdf_path}")
return "" return ""
output_doc = PdfWriter() output_doc = PdfWriter()
@ -88,24 +85,32 @@ def save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_fo
print(f"{output_suffix} 已截取并保存页面从 {start_page}{end_page}{output_pdf_path}") print(f"{output_suffix} 已截取并保存页面从 {start_page}{end_page}{output_pdf_path}")
return output_pdf_path return output_pdf_path
except Exception as e: except Exception as e:
print(f"Error in save_extracted_pages: {e}") print(f"Error in save_extracted_pages for file {pdf_path}: {e}")
return "" # 返回空字符串 return "" # 返回空字符串
def find_page_indices(pdf_document, begin_pattern, end_pattern, begin_page, common_header):
    """Locate the start and end page indices of the section to extract.

    Pages whose index is less than or equal to ``begin_page`` are ignored.
    The first remaining page whose cleaned text matches ``begin_pattern``
    becomes the start page; the first page after it that matches
    ``end_pattern`` becomes the end page.

    Args:
        pdf_document: Object exposing ``pages``; each page provides
            ``extract_text()``.
        begin_pattern: Regular expression (string or compiled) marking the
            start page.
        end_pattern: Regular expression (string or compiled) marking the
            end page.
        begin_page: Index of the last page to skip.
        common_header: Header text handed to ``clean_page_content``.

    Returns:
        A ``(start_page, end_page)`` tuple. ``start_page`` is ``None`` when no
        page matches ``begin_pattern``. ``end_page`` falls back to the index
        of the last page (``len(pages) - 1``) when no end match is found,
        including when ``start_page`` is ``None``.
    """
    start_page = None
    end_page = None
    total_pages = len(pdf_document.pages)

    for i, page in enumerate(pdf_document.pages):
        if i <= begin_page:
            continue
        # extract_text() may yield nothing for a page; treat that as empty text.
        text = page.extract_text() or ""
        cleaned_text = clean_page_content(text, common_header)
        if start_page is None:
            if re.search(begin_pattern, cleaned_text):
                start_page = i
            # The start page itself is never tested against end_pattern.
            continue
        if re.search(end_pattern, cleaned_text):
            end_page = i
            break

    # Explicit None test: the fallback must not depend on the truthiness of
    # a page index.
    if end_page is None:
        end_page = total_pages - 1
    return start_page, end_page
def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix): def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
""" """
@ -114,37 +119,45 @@ def extract_pages(pdf_path, output_folder, begin_pattern, begin_page, end_patter
try: try:
common_header = extract_common_header(pdf_path) common_header = extract_common_header(pdf_path)
pdf_document = PdfReader(pdf_path) pdf_document = PdfReader(pdf_path)
total_pages = len(pdf_document.pages) - 1 # 获取总页数
# 提取起始页和结束页 start_page, end_page = find_page_indices(pdf_document, begin_pattern, end_pattern, begin_page, common_header)
start_page, end_page = extract_pages_generic(pdf_document, begin_pattern, end_pattern, begin_page, common_header)
if output_suffix == "format": if output_suffix == "format" and start_page is None:
print(f"{output_suffix}: 未找到起始页,尝试二次提取! 文件: {pdf_path}")
# 二次提取逻辑,保留原有功能
start_page, end_page = find_page_indices(
pdf_document,
re.compile(r'^(?:响应|投标).*?格式.*', re.MULTILINE),
end_pattern,
begin_page,
common_header
)
if start_page is None: if start_page is None:
print(f"{output_suffix}: 未找到起始页,提取失败!") print(f"{output_suffix}: 未找到起始页,提取失败! 文件: {pdf_path}")
return "" return ""
if end_page is None:
# 如果未匹配到结束页,默认截取到文件末尾
end_page = total_pages
print(f"{output_suffix}: 未找到结束页,默认截取到文件末尾。")
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
if start_page is None or end_page is None: if start_page is None:
print(f"first: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!") print(f"未找到起始页,提取失败! 文件: {pdf_path}")
return "" return ""
if end_page is None:
end_page = len(pdf_document.pages) - 1
print(f"{output_suffix}: 未找到结束页,默认截取到文件末尾。 文件: {pdf_path}")
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix) return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
except Exception as e: except Exception as e:
print(f"Error processing {pdf_path}: {e}") print(f"Error processing {pdf_path}: {e}")
return "" return ""
def process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
    """Extract the requested page range from a single file.

    Word-to-PDF conversion is currently disabled, so ``.doc``/``.docx``
    inputs cannot be read as PDFs. They are skipped here with an explicit
    message instead of being passed on to ``extract_pages``.

    Args:
        file_path: Path of the input file.
        output_folder: Folder passed through to ``extract_pages``.
        begin_pattern: Pattern marking the start page.
        begin_page: Index of the last page to skip when searching.
        end_pattern: Pattern marking the end page.
        output_suffix: Suffix passed through to ``extract_pages``.

    Returns:
        The value returned by ``extract_pages``, or ``""`` when that value is
        falsy or the input was skipped.
    """
    if file_path.lower().endswith(('.doc', '.docx')):
        print(f"跳过 Word 文件(未启用 PDF 转换): {file_path}")
        return ""
    return extract_pages(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) or ""
def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix): def process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix):
""" """
@ -159,20 +172,15 @@ def process_input(input_path, output_folder, begin_pattern, begin_page, end_patt
file_path = os.path.join(input_path, file_name) file_path = os.path.join(input_path, file_name)
if is_pdf_or_doc(file_path): if is_pdf_or_doc(file_path):
result = process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) result = process_files(file_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
if isinstance(result, tuple): generated_files.append(result)
generated_files.extend([f if f else "" for f in result]) # 保留空字符串
else:
generated_files.append(result) # 直接添加result可能是空字符串
elif os.path.isfile(input_path) and is_pdf_or_doc(input_path): elif os.path.isfile(input_path) and is_pdf_or_doc(input_path):
result = process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix) result = process_files(input_path, output_folder, begin_pattern, begin_page, end_pattern, output_suffix)
if isinstance(result, tuple): generated_files.append(result)
generated_files.extend([f if f else "" for f in result]) # 保留空字符串
else:
generated_files.append(result) # 直接添加result可能是空字符串
else: else:
print("提供的路径既不是文件夹也不是PDF文件。") print("提供的路径既不是文件夹也不是PDF/Word文件。")
return [f for f in generated_files if f] # 过滤空字符串
return generated_files
def truncate_pdf_main(input_path, output_folder, selection): def truncate_pdf_main(input_path, output_folder, selection):
""" """
@ -180,27 +188,30 @@ def truncate_pdf_main(input_path, output_folder, selection):
""" """
try: try:
if selection == 1: # 投标文件格式 if selection == 1: # 投标文件格式
begin_page = 5 begin_page = 10
begin_pattern = re.compile( begin_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*' r'^第[一二三四五六七八九十百千]+(?:章|部分).*?(?:响应|投标).*?格式.*',
re.MULTILINE
) )
end_pattern = re.compile( end_pattern = re.compile(
r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+', re.MULTILINE r'^第[一二三四五六七八九十百千]+(?:章|部分)\s*[\u4e00-\u9fff]+',
re.MULTILINE
) )
local_output_suffix = "format" local_output_suffix = "format"
else: else:
print("无效的选择:请选择1") print("无效的选择:请选择1")
return None return []
# 调用相应的处理函数 # 调用相应的处理函数
return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, local_output_suffix) or "" return process_input(input_path, output_folder, begin_pattern, begin_page, end_pattern, local_output_suffix)
except Exception as e: except Exception as e:
print(f"Error in truncate_pdf_main: {e}") print(f"Error in truncate_pdf_main: {e}")
return "" # 返回空字符串 return []
if __name__ == "__main__": if __name__ == "__main__":
# 定义输入和输出路径 # 定义输入和输出路径
input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles" input_path = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\包头市公安支队机动车查验监管系统招标文201907.pdf"
output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹" output_folder = "C:\\Users\\Administrator\\Desktop\\货物标\\zbfiles\\新建文件夹"
selection = 1 # 1 - 投标文件格式 selection = 1 # 1 - 投标文件格式
# 执行截取 # 执行截取

View File

@ -431,8 +431,7 @@ def extract_pages_twice(pdf_path, output_folder, output_suffix, common_header):
else: else:
print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!") print(f"second: {output_suffix} 未找到起始或结束页在文件 {pdf_path} 中!")
return "" return ""
return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix, return save_extracted_pages(pdf_document, start_page, end_page, pdf_path, output_folder, output_suffix)
)
except Exception as e: except Exception as e:
print(f"Error in extract_pages_twice: {e}") print(f"Error in extract_pages_twice: {e}")
return "" return ""